Commit 5c358a75 authored by David Reid

Some vectorization improvements to ma_gainer.

parent 89cc773c
...@@ -3899,6 +3899,14 @@ typedef ma_uint16 wchar_t; ...@@ -3899,6 +3899,14 @@ typedef ma_uint16 wchar_t;
#endif #endif
#endif #endif
/*
Portable spelling of the "restrict" pointer qualifier. Expands to the `__restrict` extension keyword when the compiler
identifies itself as Clang, GCC or MSVC, and to nothing on any other compiler so that declarations using it still
compile. The outer #ifndef leaves any definition of MA_RESTRICT that already exists at this point untouched.
*/
#ifndef MA_RESTRICT
#if defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER)
#define MA_RESTRICT __restrict
#else
#define MA_RESTRICT
#endif
#endif
/* SIMD alignment in bytes. Currently set to 32 bytes in preparation for future AVX optimizations. */ /* SIMD alignment in bytes. Currently set to 32 bytes in preparation for future AVX optimizations. */
#define MA_SIMD_ALIGNMENT 32 #define MA_SIMD_ALIGNMENT 32
...@@ -11520,7 +11528,8 @@ static MA_INLINE ma_bool32 ma_has_neon(void) ...@@ -11520,7 +11528,8 @@ static MA_INLINE ma_bool32 ma_has_neon(void)
#define MA_SIMD_NEON 3 #define MA_SIMD_NEON 3
#ifndef MA_PREFERRED_SIMD #ifndef MA_PREFERRED_SIMD
# if defined(MA_SUPPORT_SSE2) && defined(MA_PREFER_SSE2) /* Prefer SSE2 over AVX2 if AVX2 has not been explicitly requested. */
# if defined(MA_SUPPORT_SSE2) && (defined(MA_PREFER_SSE2) || !defined(MA_PREFER_AVX2))
#define MA_PREFERRED_SIMD MA_SIMD_SSE2 #define MA_PREFERRED_SIMD MA_SIMD_SSE2
#elif defined(MA_SUPPORT_AVX2) && defined(MA_PREFER_AVX2) #elif defined(MA_SUPPORT_AVX2) && defined(MA_PREFER_AVX2)
#define MA_PREFERRED_SIMD MA_SIMD_AVX2 #define MA_PREFERRED_SIMD MA_SIMD_AVX2
...@@ -11549,14 +11558,6 @@ static MA_INLINE ma_bool32 ma_has_neon(void) ...@@ -11549,14 +11558,6 @@ static MA_INLINE ma_bool32 ma_has_neon(void)
#endif #endif
#endif #endif
#ifndef MA_RESTRICT
#if defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER)
#define MA_RESTRICT __restrict
#else
#define MA_RESTRICT
#endif
#endif
#if defined(_MSC_VER) && _MSC_VER >= 1400 #if defined(_MSC_VER) && _MSC_VER >= 1400
#define MA_HAS_BYTESWAP16_INTRINSIC #define MA_HAS_BYTESWAP16_INTRINSIC
#define MA_HAS_BYTESWAP32_INTRINSIC #define MA_HAS_BYTESWAP32_INTRINSIC
...@@ -48209,27 +48210,44 @@ MA_API ma_result ma_gainer_process_pcm_frames(ma_gainer* pGainer, void* pFramesO ...@@ -48209,27 +48210,44 @@ MA_API ma_result ma_gainer_process_pcm_frames(ma_gainer* pGainer, void* pFramesO
} }
iFrame = unrolledLoopCount << 1; iFrame = unrolledLoopCount << 1;
} else if (pGainer->config.channels == 8) { }
else if (pGainer->config.channels == 8) {
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */ /* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
for (; iFrame < interpolatedFrameCount; iFrame += 1) { #if defined(MA_SUPPORT_SSE2)
pFramesOutF32[iFrame*8 + 0] = pFramesInF32[iFrame*8 + 0] * pRunningGain[0]; if (ma_has_sse2()) {
pFramesOutF32[iFrame*8 + 1] = pFramesInF32[iFrame*8 + 1] * pRunningGain[1]; __m128 runningGain0 = _mm_loadu_ps(&pRunningGain[0]);
pFramesOutF32[iFrame*8 + 2] = pFramesInF32[iFrame*8 + 2] * pRunningGain[2]; __m128 runningGain1 = _mm_loadu_ps(&pRunningGain[4]);
pFramesOutF32[iFrame*8 + 3] = pFramesInF32[iFrame*8 + 3] * pRunningGain[3]; __m128 runningGainDelta0 = _mm_loadu_ps(&pRunningGainDelta[0]);
pFramesOutF32[iFrame*8 + 4] = pFramesInF32[iFrame*8 + 4] * pRunningGain[4]; __m128 runningGainDelta1 = _mm_loadu_ps(&pRunningGainDelta[4]);
pFramesOutF32[iFrame*8 + 5] = pFramesInF32[iFrame*8 + 5] * pRunningGain[5];
pFramesOutF32[iFrame*8 + 6] = pFramesInF32[iFrame*8 + 6] * pRunningGain[6]; for (; iFrame < interpolatedFrameCount; iFrame += 1) {
pFramesOutF32[iFrame*8 + 7] = pFramesInF32[iFrame*8 + 7] * pRunningGain[7]; _mm_storeu_ps(&pFramesOutF32[iFrame*8 + 0], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*8 + 0]), runningGain0));
_mm_storeu_ps(&pFramesOutF32[iFrame*8 + 4], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*8 + 4]), runningGain1));
runningGain0 = _mm_add_ps(runningGain0, runningGainDelta0);
runningGain1 = _mm_add_ps(runningGain1, runningGainDelta1);
}
}
else
#endif
{
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
/* This temp buffer is required to allow Clang to generate efficient auto-vectorized code. */
float temp[8];
for (iChannel = 0; iChannel < 8; iChannel += 1) {
temp[iChannel] = pFramesInF32[iFrame*8 + iChannel];
}
/* Move the running gain forward towards the new gain. */ for (iChannel = 0; iChannel < 8; iChannel += 1) {
pRunningGain[0] += pRunningGainDelta[0]; pFramesOutF32[iFrame*8 + iChannel] = temp[iChannel] * pRunningGain[iChannel];
pRunningGain[1] += pRunningGainDelta[1]; }
pRunningGain[2] += pRunningGainDelta[2];
pRunningGain[3] += pRunningGainDelta[3]; /* Move the running gain forward towards the new gain. */
pRunningGain[4] += pRunningGainDelta[4]; for (iChannel = 0; iChannel < 8; iChannel += 1) {
pRunningGain[5] += pRunningGainDelta[5]; pRunningGain[iChannel] += pRunningGainDelta[iChannel];
pRunningGain[6] += pRunningGainDelta[6]; }
pRunningGain[7] += pRunningGainDelta[7]; }
} }
} }
...@@ -52165,15 +52183,14 @@ static void ma_channel_map_apply_f32(float* pFramesOut, const ma_channel* pChann ...@@ -52165,15 +52183,14 @@ static void ma_channel_map_apply_f32(float* pFramesOut, const ma_channel* pChann
accumulation[6] += pFramesIn[iFrame*2 + 1] * weights[6][1]; accumulation[6] += pFramesIn[iFrame*2 + 1] * weights[6][1];
accumulation[7] += pFramesIn[iFrame*2 + 1] * weights[7][1]; accumulation[7] += pFramesIn[iFrame*2 + 1] * weights[7][1];
pFramesOut[iFrame*8 + 0] = accumulation[0];
pFramesOut[iFrame * 8 + 0] = accumulation[0]; pFramesOut[iFrame*8 + 1] = accumulation[1];
pFramesOut[iFrame * 8 + 1] = accumulation[1]; pFramesOut[iFrame*8 + 2] = accumulation[2];
pFramesOut[iFrame * 8 + 2] = accumulation[2]; pFramesOut[iFrame*8 + 3] = accumulation[3];
pFramesOut[iFrame * 8 + 3] = accumulation[3]; pFramesOut[iFrame*8 + 4] = accumulation[4];
pFramesOut[iFrame * 8 + 4] = accumulation[4]; pFramesOut[iFrame*8 + 5] = accumulation[5];
pFramesOut[iFrame * 8 + 5] = accumulation[5]; pFramesOut[iFrame*8 + 6] = accumulation[6];
pFramesOut[iFrame * 8 + 6] = accumulation[6]; pFramesOut[iFrame*8 + 7] = accumulation[7];
pFramesOut[iFrame * 8 + 7] = accumulation[7];
} }
} else { } else {
/* When outputting to 8 channels, we can do everything in groups of two 4x SIMD operations. */ /* When outputting to 8 channels, we can do everything in groups of two 4x SIMD operations. */
...@@ -52191,16 +52208,40 @@ static void ma_channel_map_apply_f32(float* pFramesOut, const ma_channel* pChann ...@@ -52191,16 +52208,40 @@ static void ma_channel_map_apply_f32(float* pFramesOut, const ma_channel* pChann
accumulation[7] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[7][iChannelIn]; accumulation[7] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[7][iChannelIn];
} }
pFramesOut[iFrame * 8 + 0] = accumulation[0]; pFramesOut[iFrame*8 + 0] = accumulation[0];
pFramesOut[iFrame * 8 + 1] = accumulation[1]; pFramesOut[iFrame*8 + 1] = accumulation[1];
pFramesOut[iFrame * 8 + 2] = accumulation[2]; pFramesOut[iFrame*8 + 2] = accumulation[2];
pFramesOut[iFrame * 8 + 3] = accumulation[3]; pFramesOut[iFrame*8 + 3] = accumulation[3];
pFramesOut[iFrame * 8 + 4] = accumulation[4]; pFramesOut[iFrame*8 + 4] = accumulation[4];
pFramesOut[iFrame * 8 + 5] = accumulation[5]; pFramesOut[iFrame*8 + 5] = accumulation[5];
pFramesOut[iFrame * 8 + 6] = accumulation[6]; pFramesOut[iFrame*8 + 6] = accumulation[6];
pFramesOut[iFrame * 8 + 7] = accumulation[7]; pFramesOut[iFrame*8 + 7] = accumulation[7];
} }
} }
} else if (channelsOut == 6) {
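/*
When outputting to 6 channels we unfortunately don't have a nice multiple of 4 to do 4x SIMD operations. The plan is to
expand our weights and do two frames at a time (2 x 6 = 12 values, i.e. three 4x SIMD operations).

NOTE: The loop below still advances one frame per iteration and only uses the first 6 of the 12 accumulators.
*/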
for (; iFrame < frameCount; iFrame += 1) {
float accumulation[12] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
for (iChannelIn = 0; iChannelIn < channelsIn; iChannelIn += 1) {
accumulation[0] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[0][iChannelIn];
accumulation[1] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[1][iChannelIn];
accumulation[2] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[2][iChannelIn];
accumulation[3] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[3][iChannelIn];
accumulation[4] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[4][iChannelIn];
accumulation[5] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[5][iChannelIn];
}
pFramesOut[iFrame*6 + 0] = accumulation[0];
pFramesOut[iFrame*6 + 1] = accumulation[1];
pFramesOut[iFrame*6 + 2] = accumulation[2];
pFramesOut[iFrame*6 + 3] = accumulation[3];
pFramesOut[iFrame*6 + 4] = accumulation[4];
pFramesOut[iFrame*6 + 5] = accumulation[5];
}
} }
/* Leftover frames. */ /* Leftover frames. */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment