Skip to content

Commit

Permalink
clang: improve general performance with vectorization/unrolling
Browse files Browse the repository at this point in the history
Clang has a tendency to *heavily* unroll loops all over the place:
    llvm/llvm-project#42332

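Disable loop unrolling in the loops where Clang unrolls too aggressively,
and explicitly enable vectorization in the loops that Clang does not
vectorize automatically. Also tighten the MA_ASSUME channel-count hints to
the [MA_MIN_CHANNELS, MA_MAX_CHANNELS] range and add MA_RESTRICT qualifiers
to a few pointer parameters.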

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
  • Loading branch information
tycho committed Apr 22, 2024
1 parent 0476f37 commit a947789
Showing 1 changed file with 51 additions and 32 deletions.
83 changes: 51 additions & 32 deletions miniaudio.h
Original file line number Diff line number Diff line change
Expand Up @@ -42972,7 +42972,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
}
}

MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
{
ma_uint64 iSample;

Expand Down Expand Up @@ -43267,10 +43267,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
sampleCount = frameCount * channels;

if (volume == 1) {
#pragma clang loop vectorize(enable)
for (iSample = 0; iSample < sampleCount; iSample += 1) {
pDst[iSample] += pSrc[iSample];
}
} else {
#pragma clang loop vectorize(enable)
for (iSample = 0; iSample < sampleCount; iSample += 1) {
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
}
Expand Down Expand Up @@ -45573,7 +45575,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
const float a1 = pBQ->a1.f32;
const float a2 = pBQ->a2.f32;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
#pragma clang loop vectorize(assume_safety)
for (c = 0; c < channels; c += 1) {
float r1 = pBQ->pR1[c].f32;
float r2 = pBQ->pR2[c].f32;
Expand Down Expand Up @@ -45605,7 +45608,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
const ma_int32 a1 = pBQ->a1.s32;
const ma_int32 a2 = pBQ->a2.s32;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
#pragma clang loop vectorize(assume_safety)
for (c = 0; c < channels; c += 1) {
ma_int32 r1 = pBQ->pR1[c].s32;
ma_int32 r2 = pBQ->pR2[c].s32;
Expand Down Expand Up @@ -45879,22 +45883,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
return MA_SUCCESS;
}

static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
{
ma_uint32 c;
const ma_uint32 channels = pLPF->channels;
const float a = pLPF->a.f32;
const float b = 1 - a;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
#pragma clang loop vectorize(assume_safety)
for (c = 0; c < channels; c += 1) {
float r1 = pLPF->pR1[c].f32;
float x = pX[c];
float x = pX[c];
float y;

y = b*x + a*r1;
y = b * x + a * r1;

pY[c] = y;
pY[c] = y;
pLPF->pR1[c].f32 = y;
}
}
Expand All @@ -45906,7 +45911,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
const ma_int32 a = pLPF->a.s32;
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
#pragma clang loop vectorize(assume_safety)
for (c = 0; c < channels; c += 1) {
ma_int32 r1 = pLPF->pR1[c].s32;
ma_int32 x = pX[c];
Expand Down Expand Up @@ -46759,7 +46765,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
const float a = 1 - pHPF->a.f32;
const float b = 1 - a;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
for (c = 0; c < channels; c += 1) {
float r1 = pHPF->pR1[c].f32;
float x = pX[c];
Expand All @@ -46779,7 +46785,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
for (c = 0; c < channels; c += 1) {
ma_int32 r1 = pHPF->pR1[c].s32;
ma_int32 x = pX[c];
Expand Down Expand Up @@ -48887,6 +48893,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
ma_uint64 iFrame;
ma_uint32 iChannel;
ma_uint64 interpolatedFrameCount;
const ma_uint32 channels = pGainer->config.channels;

MA_ASSERT(pGainer != NULL);

Expand Down Expand Up @@ -48926,12 +48933,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
float d = 1.0f / pGainer->config.smoothTimeInFrames;

if (pGainer->config.channels <= 32) {
if (channels <= 32) {
float pRunningGain[32];
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */

/* Initialize the running gain. */
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
for (iChannel = 0; iChannel < channels; iChannel += 1) {
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
pRunningGainDelta[iChannel] = t * d;
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
Expand All @@ -48940,7 +48947,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
iFrame = 0;

/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
if (pGainer->config.channels == 2) {
if (channels == 2) {
#if defined(MA_SUPPORT_SSE2)
if (ma_has_sse2()) {
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
Expand Down Expand Up @@ -48988,6 +48995,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte

iFrame = unrolledLoopCount << 1;
#else
#pragma clang loop vectorize(enable)
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
for (iChannel = 0; iChannel < 2; iChannel += 1) {
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
Expand All @@ -48999,7 +49007,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
}
#endif
}
} else if (pGainer->config.channels == 6) {
} else if (channels == 6) {
#if defined(MA_SUPPORT_SSE2)
if (ma_has_sse2()) {
/*
Expand Down Expand Up @@ -49043,7 +49051,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
}
}
}
} else if (pGainer->config.channels == 8) {
} else if (channels == 8) {
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
#if defined(MA_SUPPORT_SSE2)
if (ma_has_sse2()) {
Expand All @@ -49064,29 +49072,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
{
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < 8; iChannel += 1) {
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
}

/* Move the running gain forward towards the new gain. */
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < 8; iChannel += 1) {
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
}
}
}
}

#pragma clang loop unroll(disable)
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < channels; iChannel += 1) {
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
}
}
} else {
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
#pragma clang loop unroll(disable)
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < channels; iChannel += 1) {
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
}

a += d;
Expand All @@ -49105,18 +49119,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte

/* All we need to do here is apply the new gains using an optimized path. */
if (pFramesOut != NULL && pFramesIn != NULL) {
if (pGainer->config.channels <= 32) {
if (channels <= 32) {
float gains[32];
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
for (iChannel = 0; iChannel < channels; iChannel += 1) {
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
}

ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
} else {
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
#pragma clang loop unroll(disable)
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
#pragma clang loop vectorize(enable)
for (iChannel = 0; iChannel < channels; iChannel += 1) {
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
}
}
}
Expand Down Expand Up @@ -51518,7 +51534,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa

a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
for (c = 0; c < channels; c += 1) {
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
pFrameOut[c] = s;
Expand All @@ -51537,7 +51553,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa

a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;

MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
for (c = 0; c < channels; c += 1) {
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
pFrameOut[c] = s;
Expand Down Expand Up @@ -51708,7 +51724,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
}


static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
{
const float* pFramesInF32;
/* */ float* pFramesOutF32;
Expand Down Expand Up @@ -51784,7 +51800,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
return MA_SUCCESS;
}

static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
{
const float* pFramesInF32;
/* */ float* pFramesOutF32;
Expand Down Expand Up @@ -53097,6 +53113,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
#endif
{
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
#pragma clang loop vectorize(enable)
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
}
Expand Down Expand Up @@ -53124,6 +53141,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
#endif
{
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
#pragma clang loop vectorize(enable)
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
}
Expand All @@ -53141,6 +53159,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
#endif
{
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
#pragma clang loop vectorize(enable)
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
}
Expand Down Expand Up @@ -66886,7 +66905,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
ma_uint64 iFrame;
ma_uint32 iChannel;
const ma_uint32 channels = pNoise->config.channels;
MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);

if (pNoise->config.format == ma_format_f32) {
float* pFramesOutF32 = (float*)pFramesOut;
Expand Down Expand Up @@ -67005,7 +67024,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
ma_uint64 iFrame;
ma_uint32 iChannel;
const ma_uint32 channels = pNoise->config.channels;
MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);

if (pNoise->config.format == ma_format_f32) {
float* pFramesOutF32 = (float*)pFramesOut;
Expand Down Expand Up @@ -67087,7 +67106,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
ma_uint64 iFrame;
ma_uint32 iChannel;
const ma_uint32 channels = pNoise->config.channels;
MA_ASSUME(channels > 0);
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);

if (pNoise->config.format == ma_format_f32) {
float* pFramesOutF32 = (float*)pFramesOut;
Expand Down

0 comments on commit a947789

Please sign in to comment.