Skip to content

Commit

Permalink
clang: improve general performance with vectorization/unrolling
Browse files Browse the repository at this point in the history
Clang has a tendency to *heavily* unroll loops all over the place:
    llvm/llvm-project#42332

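Disable loop unrolling where Clang unrolls too aggressively, and enable
vectorization where Clang does not apply it automatically. Also bound the
assumed channel count to [MA_MIN_CHANNELS, MA_MAX_CHANNELS] and add
MA_RESTRICT qualifiers on a few hot paths to give the optimizer more to work with.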

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
  • Loading branch information
tycho committed May 8, 2024
1 parent d4a9140 commit a2b8ffe
Showing 1 changed file with 51 additions and 32 deletions.
83 changes: 51 additions & 32 deletions miniaudio.h
Original file line number Diff line number Diff line change
Expand Up @@ -43001,7 +43001,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
}
}

MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
{
ma_uint64 iSample;

Expand Down Expand Up @@ -43296,10 +43296,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
sampleCount = frameCount * channels;

if (volume == 1) {
#pragma clang loop vectorize(enable)
for (iSample = 0; iSample < sampleCount; iSample += 1) {
pDst[iSample] += pSrc[iSample];
}
} else {
#pragma clang loop vectorize(enable)
for (iSample = 0; iSample < sampleCount; iSample += 1) {
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
}
Expand Down Expand Up @@ -45602,7 +45604,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
const float a1 = pBQ->a1.f32;
const float a2 = pBQ->a2.f32;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
#pragma clang loop vectorize(assume_safety)
for (c = 0; c < channels; c += 1) {
float r1 = pBQ->pR1[c].f32;
float r2 = pBQ->pR2[c].f32;
Expand Down Expand Up @@ -45634,7 +45637,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
const ma_int32 a1 = pBQ->a1.s32;
const ma_int32 a2 = pBQ->a2.s32;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
#pragma clang loop vectorize(assume_safety)
for (c = 0; c < channels; c += 1) {
ma_int32 r1 = pBQ->pR1[c].s32;
ma_int32 r2 = pBQ->pR2[c].s32;
Expand Down Expand Up @@ -45908,22 +45912,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
return MA_SUCCESS;
}

static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
{
ma_uint32 c;
const ma_uint32 channels = pLPF->channels;
const float a = pLPF->a.f32;
const float b = 1 - a;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
#pragma clang loop vectorize(assume_safety)
for (c = 0; c < channels; c += 1) {
float r1 = pLPF->pR1[c].f32;
float x = pX[c];
float x = pX[c];
float y;

y = b*x + a*r1;
y = b * x + a * r1;

pY[c] = y;
pY[c] = y;
pLPF->pR1[c].f32 = y;
}
}
Expand All @@ -45935,7 +45940,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
const ma_int32 a = pLPF->a.s32;
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
#pragma clang loop vectorize(assume_safety)
for (c = 0; c < channels; c += 1) {
ma_int32 r1 = pLPF->pR1[c].s32;
ma_int32 x = pX[c];
Expand Down Expand Up @@ -46788,7 +46794,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
const float a = 1 - pHPF->a.f32;
const float b = 1 - a;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
for (c = 0; c < channels; c += 1) {
float r1 = pHPF->pR1[c].f32;
float x = pX[c];
Expand All @@ -46808,7 +46814,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
for (c = 0; c < channels; c += 1) {
ma_int32 r1 = pHPF->pR1[c].s32;
ma_int32 x = pX[c];
Expand Down Expand Up @@ -48916,6 +48922,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
ma_uint64 iFrame;
ma_uint32 iChannel;
ma_uint64 interpolatedFrameCount;
const ma_uint32 channels = pGainer->config.channels;

MA_ASSERT(pGainer != NULL);

Expand Down Expand Up @@ -48955,12 +48962,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
float d = 1.0f / pGainer->config.smoothTimeInFrames;

if (pGainer->config.channels <= 32) {
if (channels <= 32) {
float pRunningGain[32];
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */

/* Initialize the running gain. */
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
for (iChannel = 0; iChannel < channels; iChannel += 1) {
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
pRunningGainDelta[iChannel] = t * d;
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
Expand All @@ -48969,7 +48976,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
iFrame = 0;

/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
if (pGainer->config.channels == 2) {
if (channels == 2) {
#if defined(MA_SUPPORT_SSE2)
if (ma_has_sse2()) {
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
Expand Down Expand Up @@ -49017,6 +49024,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte

iFrame = unrolledLoopCount << 1;
#else
#pragma clang loop vectorize(enable)
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
for (iChannel = 0; iChannel < 2; iChannel += 1) {
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
Expand All @@ -49028,7 +49036,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
}
#endif
}
} else if (pGainer->config.channels == 6) {
} else if (channels == 6) {
#if defined(MA_SUPPORT_SSE2)
if (ma_has_sse2()) {
/*
Expand Down Expand Up @@ -49072,7 +49080,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
}
}
}
} else if (pGainer->config.channels == 8) {
} else if (channels == 8) {
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
#if defined(MA_SUPPORT_SSE2)
if (ma_has_sse2()) {
Expand All @@ -49093,29 +49101,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
{
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < 8; iChannel += 1) {
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
}

/* Move the running gain forward towards the new gain. */
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < 8; iChannel += 1) {
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
}
}
}
}

#pragma clang loop unroll(disable)
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < channels; iChannel += 1) {
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
}
}
} else {
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
#pragma clang loop unroll(disable)
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < channels; iChannel += 1) {
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
}

a += d;
Expand All @@ -49134,18 +49148,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte

/* All we need to do here is apply the new gains using an optimized path. */
if (pFramesOut != NULL && pFramesIn != NULL) {
if (pGainer->config.channels <= 32) {
if (channels <= 32) {
float gains[32];
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
for (iChannel = 0; iChannel < channels; iChannel += 1) {
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
}

ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
} else {
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
#pragma clang loop unroll(disable)
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < channels; iChannel += 1) {
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
}
}
}
Expand Down Expand Up @@ -51547,7 +51563,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa

a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
for (c = 0; c < channels; c += 1) {
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
pFrameOut[c] = s;
Expand All @@ -51566,7 +51582,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa

a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
for (c = 0; c < channels; c += 1) {
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
pFrameOut[c] = s;
Expand Down Expand Up @@ -51737,7 +51753,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
}


static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
{
const float* pFramesInF32;
/* */ float* pFramesOutF32;
Expand Down Expand Up @@ -51813,7 +51829,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
return MA_SUCCESS;
}

static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
{
const float* pFramesInF32;
/* */ float* pFramesOutF32;
Expand Down Expand Up @@ -53126,6 +53142,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
#endif
{
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
#pragma clang loop vectorize(enable)
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
}
Expand Down Expand Up @@ -53153,6 +53170,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
#endif
{
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
#pragma clang loop vectorize(enable)
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
}
Expand All @@ -53170,6 +53188,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
#endif
{
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
#pragma clang loop vectorize(enable)
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
}
Expand Down Expand Up @@ -66915,7 +66934,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
ma_uint64 iFrame;
ma_uint32 iChannel;
const ma_uint32 channels = pNoise->config.channels;
MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);

if (pNoise->config.format == ma_format_f32) {
float* pFramesOutF32 = (float*)pFramesOut;
Expand Down Expand Up @@ -67034,7 +67053,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
ma_uint64 iFrame;
ma_uint32 iChannel;
const ma_uint32 channels = pNoise->config.channels;
MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);

if (pNoise->config.format == ma_format_f32) {
float* pFramesOutF32 = (float*)pFramesOut;
Expand Down Expand Up @@ -67116,7 +67135,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
ma_uint64 iFrame;
ma_uint32 iChannel;
const ma_uint32 channels = pNoise->config.channels;
MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);

if (pNoise->config.format == ma_format_f32) {
float* pFramesOutF32 = (float*)pFramesOut;
Expand Down

0 comments on commit a2b8ffe

Please sign in to comment.