Commit 4c4fe083 authored by David Reid's avatar David Reid

Early experimental SIMD work.

parent f89296d7
...@@ -47,6 +47,8 @@ ...@@ -47,6 +47,8 @@
// the development packages for any particular backend you can disable it by #define-ing the appropriate MAL_NO_* // the development packages for any particular backend you can disable it by #define-ing the appropriate MAL_NO_*
// option before the implementation. // option before the implementation.
// //
// Note that GCC and Clang requires "-msse2", "-mavx", etc. for SIMD optimizations.
//
// //
// Building for Windows // Building for Windows
// -------------------- // --------------------
...@@ -55,7 +57,7 @@ ...@@ -55,7 +57,7 @@
// //
// Building for Linux // Building for Linux
// ------------------ // ------------------
// The Linux build only requires linking to -ldl and -lpthread. You do not need any development packages for any // The Linux build only requires linking to -ldl, -lpthread and -lm. You do not need any development packages for any
// of the supported backends. // of the supported backends.
// //
// Building for BSD // Building for BSD
...@@ -71,8 +73,7 @@ ...@@ -71,8 +73,7 @@
// Building for Emscripten // Building for Emscripten
// ----------------------- // -----------------------
// The Emscripten build currently uses SDL 1.2 for it's backend which means specifying "-s USE_SDL=2" is unecessary // The Emscripten build currently uses SDL 1.2 for it's backend which means specifying "-s USE_SDL=2" is unecessary
// as of this version. However, if in the future there is legitimate benefit or enough demand for SDL 2 to be used // as of this version.
// instead, you will need to specify this when compiling.
// //
// //
// Playback Example // Playback Example
...@@ -200,7 +201,19 @@ ...@@ -200,7 +201,19 @@
// Disables the decoding APIs. // Disables the decoding APIs.
// //
// #define MAL_NO_STDIO // #define MAL_NO_STDIO
// Disables file IO APIs // Disables file IO APIs.
//
// #define MAL_NO_SSE2
// Disables SSE2 optimizations.
//
// #define MAL_NO_AVX
// Disables AVX optimizations.
//
// #define MAL_NO_AVX512
// Disables AVX-512 optimizations.
//
// #define MAL_NO_NEON
// Disables NEON optimizations.
#ifndef mini_al_h #ifndef mini_al_h
#define mini_al_h #define mini_al_h
...@@ -791,6 +804,10 @@ typedef struct ...@@ -791,6 +804,10 @@ typedef struct
mal_channel channelMapIn[MAL_MAX_CHANNELS]; mal_channel channelMapIn[MAL_MAX_CHANNELS];
mal_channel channelMapOut[MAL_MAX_CHANNELS]; mal_channel channelMapOut[MAL_MAX_CHANNELS];
mal_channel_mix_mode mixingMode; mal_channel_mix_mode mixingMode;
mal_bool32 noSSE2 : 1;
mal_bool32 noAVX : 1;
mal_bool32 noAVX512 : 1;
mal_bool32 noNEON : 1;
mal_channel_router_read_deinterleaved_proc onReadDeinterleaved; mal_channel_router_read_deinterleaved_proc onReadDeinterleaved;
void* pUserData; void* pUserData;
} mal_channel_router_config; } mal_channel_router_config;
...@@ -800,6 +817,10 @@ struct mal_channel_router ...@@ -800,6 +817,10 @@ struct mal_channel_router
mal_channel_router_config config; mal_channel_router_config config;
mal_bool32 isPassthrough : 1; mal_bool32 isPassthrough : 1;
mal_bool32 isSimpleShuffle : 1; mal_bool32 isSimpleShuffle : 1;
mal_bool32 useSSE2 : 1;
mal_bool32 useAVX : 1;
mal_bool32 useAVX512 : 1;
mal_bool32 useNEON : 1;
mal_uint8 shuffleTable[MAL_MAX_CHANNELS]; mal_uint8 shuffleTable[MAL_MAX_CHANNELS];
float weights[MAL_MAX_CHANNELS][MAL_MAX_CHANNELS]; float weights[MAL_MAX_CHANNELS][MAL_MAX_CHANNELS];
}; };
...@@ -2291,6 +2312,241 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float* ...@@ -2291,6 +2312,241 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
#endif #endif
#endif #endif
// Architecture Detection
#if defined(__x86_64__) || defined(_M_X64)
#define MAL_X64
#elif defined(__i386) || defined(_M_IX86)
#define MAL_X86
#elif defined(__arm__) || defined(_M_ARM)
#define MAL_ARM
#endif
// Intrinsics Support
#if defined(MAL_X64) || defined(MAL_X86)
#if defined(_MSC_VER)
// MSVC.
#if !defined(MAL_NO_SSE2) // Assume all MSVC compilers support SSE2 intrinsics.
#define MAL_SUPPORT_SSE2
#endif
#if _MSC_VER >= 1600 && !defined(MAL_NO_AVX) // 2010
#define MAL_SUPPORT_AVX
#endif
#if _MSC_VER >= 1910 && !defined(MAL_NO_AVX512) // 2017
#define MAL_SUPPORT_AVX512
#endif
#else
// Assume GNUC-style.
#if defined(__SSE2__) && !defined(MAL_NO_SSE2)
#define MAL_SUPPORT_SSE2
#endif
#if defined(__AVX__) && !defined(MAL_NO_AVX)
#define MAL_SUPPORT_AVX
#endif
#if defined(__AVX512F__) && !defined(MAL_NO_AVX512)
#define MAL_SUPPORT_AVX512
#endif
#endif
// If at this point we still haven't determined compiler support for the intrinsics just fall back to __has_include.
#if !defined(__GNUC__) && defined(__has_include)
#if !defined(MAL_SUPPORT_SSE2) && !defined(MAL_NO_SSE2) && __has_include(<emmintrin.h>)
#define MAL_SUPPORT_SSE2
#endif
#if !defined(MAL_SUPPORT_AVX) && !defined(MAL_NO_AVX) && __has_include(<immintrin.h>)
#define MAL_SUPPORT_AVX
#endif
#if !defined(MAL_SUPPORT_AVX512) && !defined(MAL_NO_AVX512) && __has_include(<zmmintrin.h>)
#define MAL_SUPPORT_AVX512
#endif
#endif
#if defined(MAL_SUPPORT_AVX512)
#include <immintrin.h> // Not a mistake. Intentionally including <immintrin.h> instead of <zmmintrin.h> because otherwise the compiler will complain.
#elif defined(MAL_SUPPORT_AVX)
#include <immintrin.h>
#elif defined(MAL_SUPPORT_SSE2)
#include <emmintrin.h>
#endif
#endif
#if defined(MAL_ARM)
#if !defined(MAL_NO_NEON) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
#define MAL_SUPPORT_NEON
#endif
// Fall back to looking for the #include file.
#if !defined(__GNUC__) && defined(__has_include)
#if !defined(MAL_SUPPORT_NEON) && !defined(MAL_NO_NEON) && __has_include(<arm_neon.h>)
#define MAL_SUPPORT_NEON
#endif
#endif
#if defined(MAL_SUPPORT_NEON)
#include <arm_neon.h>
#endif
#endif
#if defined(MAL_X64) || defined(MAL_X86)
#if defined(_MSC_VER)
#if _MSC_VER >= 1400
#include <intrin.h>
static MAL_INLINE void mal_cpuid(int info[4], int fid)
{
__cpuid(info, fid);
}
#else
#define MAL_NO_CPUID
#endif
#if _MSC_VER >= 1600
static MAL_INLINE unsigned __int64 mal_xgetbv(int reg)
{
return _xgetbv(reg);
}
#else
#define MAL_NO_XGETBV
#endif
#elif defined(__GNUC__) || defined(__clang__)
static MAL_INLINE void mal_cpuid(int info[4], int fid)
{
asm (
"movl %[fid], %%eax\n\t"
"cpuid\n\t"
"movl %%eax, %[info0]\n\t"
"movl %%ebx, %[info1]\n\t"
"movl %%ecx, %[info2]\n\t"
"movl %%edx, %[info3]\n\t"
: [info0] "=rm"(info[0]),
[info1] "=rm"(info[1]),
[info2] "=rm"(info[2]),
[info3] "=rm"(info[3])
: [fid] "rm"(fid)
: "eax", "ebx", "ecx", "edx"
);
}
static MAL_INLINE unsigned long long mal_xgetbv(int reg)
{
unsigned int hi;
unsigned int lo;
asm (
"movl %[reg], %%ecx\n\t"
"xgetbv\n\t"
"movl %%eax, %[lo]\n\t"
"movl %%edx, %[hi]\n\t"
: [lo] "=rm"(lo),
[hi] "=rm"(hi)
: [reg] "rm"(reg)
: "eax", "ecx", "edx"
);
return ((unsigned long long)hi << 32ULL) | (unsigned long long)lo;
}
#else
#define MAL_NO_CPUID
#define MAL_NO_XGETBV
#endif
#else
#define MAL_NO_CPUID
#define MAL_NO_XGETBV
#endif
static MAL_INLINE mal_bool32 mal_has_sse2()
{
#if (defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_SSE2)
#if defined(MAL_X64)
return MAL_TRUE; // 64-bit targets always support SSE2.
#elif (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)
return MAL_TRUE; // If the compiler is allowed to freely generate SSE2 code we can assume support.
#else
#if defined(MAL_NO_CPUID)
return MAL_FALSE;
#else
int info[4];
mal_cpuid(info, 1);
return (info[3] & (1 << 26)) != 0;
#endif
#endif
#else
return MAL_FALSE; // SSE2 is only supported on x86 and x64 architectures.
#endif
}
static MAL_INLINE mal_bool32 mal_has_avx()
{
#if (defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_AVX)
#if defined(_AVX_) || defined(__AVX__)
return MAL_TRUE; // If the compiler is allowed to freely generate AVX code we can assume support.
#else
// AVX requires both CPU and OS support.
#if defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
return MAL_FALSE;
#else
int info[4];
mal_cpuid(info, 1);
if (((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0)) {
mal_uint64 xrc = mal_xgetbv(0);
if ((xrc & 0x06) == 0x06) {
return MAL_TRUE;
} else {
return MAL_FALSE;
}
} else {
return MAL_FALSE;
}
#endif
#endif
#else
return MAL_FALSE; // AVX is only supported on x86 and x64 architectures.
#endif
}
static MAL_INLINE mal_bool32 mal_has_avx512f()
{
#if (defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_AVX512)
#if defined(__AVX512F__)
return MAL_TRUE; // If the compiler is allowed to freely generate AVX-512F code we can assume support.
#else
// AVX-512 requires both CPU and OS support.
#if defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
return MAL_FALSE;
#else
int info[4];
mal_cpuid(info, 1);
if (((info[2] & (1 << 27)) != 0) && ((info[1] & (1 << 16)) != 0)) {
mal_uint64 xrc = mal_xgetbv(0);
if ((xrc & 0xE6) == 0xE6) {
return MAL_TRUE;
} else {
return MAL_FALSE;
}
} else {
return MAL_FALSE;
}
#endif
#endif
#else
return MAL_FALSE; // AVX-512F is only supported on x86 and x64 architectures.
#endif
}
static MAL_INLINE mal_bool32 mal_has_neon()
{
#if defined(MAL_ARM) && !defined(MAL_NO_NEON)
#if (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
return MAL_TRUE; // If the compiler is allowed to freely generate NEON code we can assume support.
#else
// TODO: Runtime check.
return MAL_FALSE;
#endif
#else
return MAL_FALSE; // NEON is only supported on ARM architectures.
#endif
}
#ifndef MAL_PI #ifndef MAL_PI
#define MAL_PI 3.14159265358979323846264f #define MAL_PI 3.14159265358979323846264f
#endif #endif
...@@ -2300,9 +2556,9 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float* ...@@ -2300,9 +2556,9 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
// Unfortunately using runtime linking for pthreads causes problems. This has occurred for me when testing on FreeBSD. When // Unfortunately using runtime linking for pthreads causes problems. This has occurred for me when testing on FreeBSD. When
// using runtime linking, deadlocks can occur (for me it happens when loading data from fread()). It turns out that doing // using runtime linking, deadlocks can occur (for me it happens when loading data from fread()). It turns out that doing
// compile-time linking fixes this. I'm not sure why this happens, but this is the safest way I can think of to continue. To // compile-time linking fixes this. I'm not sure why this happens, but the safest way I can think of to fix this is to simply
// enable runtime linking, #define this before the implementation of this file. I am not officially supporting this, but I'm // disable runtime linking by default. To enable runtime linking, #define this before the implementation of this file. I am
// leaving it here in case it's useful for somebody, somewhere. // not officially supporting this, but I'm leaving it here in case it's useful for somebody, somewhere.
//#define MAL_USE_RUNTIME_LINKING_FOR_PTHREAD //#define MAL_USE_RUNTIME_LINKING_FOR_PTHREAD
// Disable run-time linking on certain backends. // Disable run-time linking on certain backends.
...@@ -15601,13 +15857,53 @@ mal_bool32 mal_channel_map_contains_channel_position(mal_uint32 channels, const ...@@ -15601,13 +15857,53 @@ mal_bool32 mal_channel_map_contains_channel_position(mal_uint32 channels, const
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//#define MAL_USE_REFERENCE_CONVERSION_APIS 1 //#define MAL_USE_REFERENCE_CONVERSION_APIS 1
#define MAL_USE_SSE //#define MAL_USE_SSE
void mal_copy_memory_64(void* dst, const void* src, mal_uint64 sizeInBytes)
{
#if 0xFFFFFFFFFFFFFFFF <= SIZE_MAX
mal_copy_memory(dst, src, (size_t)sizeInBytes);
#else
while (sizeInBytes > 0) {
mal_uint64 bytesToCopyNow = sizeInBytes;
if (bytesToCopyNow > SIZE_MAX) {
bytesToCopyNow = SIZE_MAX;
}
mal_copy_memory(dst, src, (size_t)bytesToCopyNow); // Safe cast to size_t.
sizeInBytes -= bytesToCopyNow;
dst = ( void*)(( mal_uint8*)dst + bytesToCopyNow);
src = (const void*)((const mal_uint8*)src + bytesToCopyNow);
}
#endif
}
void mal_zero_memory_64(void* dst, mal_uint64 sizeInBytes)
{
#if 0xFFFFFFFFFFFFFFFF <= SIZE_MAX
mal_zero_memory(dst, (size_t)sizeInBytes);
#else
while (sizeInBytes > 0) {
mal_uint64 bytesToZeroNow = sizeInBytes;
if (bytesToZeroNow > SIZE_MAX) {
bytesToZeroNow = SIZE_MAX;
}
mal_zero_memory(dst, (size_t)bytesToZeroNow); // Safe cast to size_t.
sizeInBytes -= bytesToZeroNow;
dst = (void*)((mal_uint8*)dst + bytesToZeroNow);
}
#endif
}
// u8 // u8
void mal_pcm_u8_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) void mal_pcm_u8_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{ {
(void)ditherMode; (void)ditherMode;
mal_copy_memory(dst, src, count * sizeof(mal_uint8)); mal_copy_memory_64(dst, src, count * sizeof(mal_uint8));
} }
...@@ -15803,7 +16099,7 @@ void mal_pcm_interleave_u8__optimized(void* dst, const void** src, mal_uint64 fr ...@@ -15803,7 +16099,7 @@ void mal_pcm_interleave_u8__optimized(void* dst, const void** src, mal_uint64 fr
const mal_uint8** src_u8 = (const mal_uint8**)src; const mal_uint8** src_u8 = (const mal_uint8**)src;
if (channels == 1) { if (channels == 1) {
mal_copy_memory(dst, src[0], frameCount * sizeof(mal_uint8)); mal_copy_memory_64(dst, src[0], frameCount * sizeof(mal_uint8));
} else if (channels == 2) { } else if (channels == 2) {
mal_uint64 iFrame; mal_uint64 iFrame;
for (iFrame = 0; iFrame < frameCount; iFrame += 1) { for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
...@@ -15906,7 +16202,7 @@ void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_ ...@@ -15906,7 +16202,7 @@ void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
void mal_pcm_s16_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) void mal_pcm_s16_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{ {
(void)ditherMode; (void)ditherMode;
mal_copy_memory(dst, src, count * sizeof(mal_int16)); mal_copy_memory_64(dst, src, count * sizeof(mal_int16));
} }
...@@ -16185,7 +16481,7 @@ void mal_pcm_s24_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither ...@@ -16185,7 +16481,7 @@ void mal_pcm_s24_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
{ {
(void)ditherMode; (void)ditherMode;
mal_copy_memory(dst, src, count * 3); mal_copy_memory_64(dst, src, count * 3);
} }
...@@ -16472,7 +16768,7 @@ void mal_pcm_s32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither ...@@ -16472,7 +16768,7 @@ void mal_pcm_s32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
{ {
(void)ditherMode; (void)ditherMode;
mal_copy_memory(dst, src, count * sizeof(mal_int32)); mal_copy_memory_64(dst, src, count * sizeof(mal_int32));
} }
...@@ -16791,7 +17087,7 @@ void mal_pcm_f32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither ...@@ -16791,7 +17087,7 @@ void mal_pcm_f32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
{ {
(void)ditherMode; (void)ditherMode;
mal_copy_memory(dst, src, count * sizeof(float)); mal_copy_memory_64(dst, src, count * sizeof(float));
} }
...@@ -17214,6 +17510,40 @@ mal_uint64 mal_format_converter_read_deinterleaved(mal_format_converter* pConver ...@@ -17214,6 +17510,40 @@ mal_uint64 mal_format_converter_read_deinterleaved(mal_format_converter* pConver
// //
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Splits a buffer into parts of equal length and of the given alignment. The returned size of the split buffers will be a
// multiple of the alignment. The alignment must be a power of 2.
void mal_split_buffer(void* pBuffer, size_t bufferSize, size_t splitCount, size_t alignment, void** ppBuffersOut, size_t* pSplitSizeOut)
{
if (pBuffer == NULL || bufferSize == 0 || splitCount == 0) {
return;
}
if (alignment == 0) {
alignment = 1;
}
mal_uintptr pBufferUnaligned = (mal_uintptr)pBuffer;
mal_uintptr pBufferAligned = (pBufferUnaligned + (alignment-1)) & ~(alignment-1);
size_t unalignedBytes = (size_t)(pBufferAligned - pBufferUnaligned);
size_t splitSize = 0;
if (bufferSize >= unalignedBytes) {
splitSize = (bufferSize - unalignedBytes) / splitCount;
splitSize = splitSize & ~(alignment-1);
}
if (ppBuffersOut != NULL) {
for (size_t i = 0; i < splitCount; ++i) {
ppBuffersOut[i] = (mal_uint8*)(pBufferAligned + (splitSize*i));
}
}
if (pSplitSizeOut) {
*pSplitSizeOut = splitSize;
}
}
// -X = Left, +X = Right // -X = Left, +X = Right
// -Y = Bottom, +Y = Top // -Y = Bottom, +Y = Top
// -Z = Front, +Z = Back // -Z = Front, +Z = Back
...@@ -17456,6 +17786,12 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal ...@@ -17456,6 +17786,12 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
pRouter->config = *pConfig; pRouter->config = *pConfig;
// SIMD
pRouter->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
pRouter->useAVX = mal_has_avx() && !pConfig->noAVX;
pRouter->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
pRouter->useNEON = mal_has_neon() && !pConfig->noNEON;
// If the input and output channels and channel maps are the same we should use a passthrough. // If the input and output channels and channel maps are the same we should use a passthrough.
if (pRouter->config.channelsIn == pRouter->config.channelsOut) { if (pRouter->config.channelsIn == pRouter->config.channelsOut) {
if (mal_channel_map_equal(pRouter->config.channelsIn, pRouter->config.channelMapIn, pRouter->config.channelMapOut)) { if (mal_channel_map_equal(pRouter->config.channelsIn, pRouter->config.channelMapIn, pRouter->config.channelMapOut)) {
...@@ -17631,6 +17967,26 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal ...@@ -17631,6 +17967,26 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
return MAL_SUCCESS; return MAL_SUCCESS;
} }
static MAL_INLINE mal_bool32 mal_channel_router__can_use_sse2(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
return pRouter->useSSE2 && (((mal_uintptr)pSamplesOut & 15) == 0) && (((mal_uintptr)pSamplesIn & 15) == 0);
}
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
return pRouter->useAVX && (((mal_uintptr)pSamplesOut & 31) == 0) && (((mal_uintptr)pSamplesIn & 31) == 0);
}
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx512(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
return pRouter->useAVX512 && (((mal_uintptr)pSamplesOut & 63) == 0) && (((mal_uintptr)pSamplesIn & 63) == 0);
}
static MAL_INLINE mal_bool32 mal_channel_router__can_use_neon(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
return pRouter->useNEON && (((mal_uintptr)pSamplesOut & 15) == 0) && (((mal_uintptr)pSamplesIn & 15) == 0);
}
void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 frameCount, float** ppSamplesOut, const float** ppSamplesIn) void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 frameCount, float** ppSamplesOut, const float** ppSamplesIn)
{ {
mal_assert(pRouter != NULL); mal_assert(pRouter != NULL);
...@@ -17641,20 +17997,83 @@ void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 fram ...@@ -17641,20 +17997,83 @@ void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 fram
mal_assert(pRouter->config.channelsIn == pRouter->config.channelsOut); mal_assert(pRouter->config.channelsIn == pRouter->config.channelsOut);
for (mal_uint32 iChannelIn = 0; iChannelIn < pRouter->config.channelsIn; ++iChannelIn) { for (mal_uint32 iChannelIn = 0; iChannelIn < pRouter->config.channelsIn; ++iChannelIn) {
mal_uint32 iChannelOut = pRouter->shuffleTable[iChannelIn]; mal_uint32 iChannelOut = pRouter->shuffleTable[iChannelIn];
mal_copy_memory(ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn], frameCount * sizeof(float)); mal_copy_memory_64(ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn], frameCount * sizeof(float));
} }
} else { } else {
// This is the more complicated case. Each of the output channels is accumulated with 0 or more input channels. // This is the more complicated case. Each of the output channels is accumulated with 0 or more input channels.
// Clear. // Clear.
for (mal_uint32 iChannelOut = 0; iChannelOut < pRouter->config.channelsOut; ++iChannelOut) { for (mal_uint32 iChannelOut = 0; iChannelOut < pRouter->config.channelsOut; ++iChannelOut) {
mal_zero_memory(ppSamplesOut[iChannelOut], frameCount * sizeof(float)); mal_zero_memory_64(ppSamplesOut[iChannelOut], frameCount * sizeof(float));
} }
// Accumulate. // Accumulate.
for (mal_uint32 iChannelIn = 0; iChannelIn < pRouter->config.channelsIn; ++iChannelIn) { for (mal_uint32 iChannelIn = 0; iChannelIn < pRouter->config.channelsIn; ++iChannelIn) {
for (mal_uint32 iChannelOut = 0; iChannelOut < pRouter->config.channelsOut; ++iChannelOut) { for (mal_uint32 iChannelOut = 0; iChannelOut < pRouter->config.channelsOut; ++iChannelOut) {
for (mal_uint64 iFrame = 0; iFrame < frameCount; ++iFrame) { mal_uint64 iFrame = 0;
#if defined(MAL_SUPPORT_AVX512)
if (mal_channel_router__can_use_avx512(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
__m512 weight = _mm512_set1_ps(pRouter->weights[iChannelIn][iChannelOut]);
mal_uint64 frameCount16 = frameCount/16;
for (mal_uint64 iFrame16 = 0; iFrame16 < frameCount16; iFrame16 += 1) {
__m512* pO = (__m512*)ppSamplesOut[iChannelOut] + iFrame16;
__m512* pI = (__m512*)ppSamplesIn [iChannelIn ] + iFrame16;
*pO = _mm512_add_ps(*pO, _mm512_mul_ps(*pI, weight));
}
iFrame += frameCount16*16;
}
else
#endif
#if defined(MAL_SUPPORT_AVX)
if (mal_channel_router__can_use_avx(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
__m256 weight = _mm256_set1_ps(pRouter->weights[iChannelIn][iChannelOut]);
mal_uint64 frameCount8 = frameCount/8;
for (mal_uint64 iFrame8 = 0; iFrame8 < frameCount8; iFrame8 += 1) {
__m256* pO = (__m256*)ppSamplesOut[iChannelOut] + iFrame8;
__m256* pI = (__m256*)ppSamplesIn [iChannelIn ] + iFrame8;
*pO = _mm256_add_ps(*pO, _mm256_mul_ps(*pI, weight));
}
iFrame += frameCount8*8;
}
else
#endif
#if defined(MAL_SUPPORT_SSE2)
if (mal_channel_router__can_use_sse2(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
__m128 weight = _mm_set1_ps(pRouter->weights[iChannelIn][iChannelOut]);
mal_uint64 frameCount4 = frameCount/4;
for (mal_uint64 iFrame4 = 0; iFrame4 < frameCount4; iFrame4 += 1) {
__m128* pO = (__m128*)ppSamplesOut[iChannelOut] + iFrame4;
__m128* pI = (__m128*)ppSamplesIn [iChannelIn ] + iFrame4;
*pO = _mm_add_ps(*pO, _mm_mul_ps(*pI, weight));
}
iFrame += frameCount4*4;
} else
#endif
{ // Reference.
float weight0 = pRouter->weights[iChannelIn][iChannelOut];
float weight1 = pRouter->weights[iChannelIn][iChannelOut];
float weight2 = pRouter->weights[iChannelIn][iChannelOut];
float weight3 = pRouter->weights[iChannelIn][iChannelOut];
mal_uint64 frameCount4 = frameCount/4;
for (mal_uint64 iFrame4 = 0; iFrame4 < frameCount4; iFrame4 += 1) {
ppSamplesOut[iChannelOut][iFrame+0] += ppSamplesIn[iChannelIn][iFrame+0] * weight0;
ppSamplesOut[iChannelOut][iFrame+1] += ppSamplesIn[iChannelIn][iFrame+1] * weight1;
ppSamplesOut[iChannelOut][iFrame+2] += ppSamplesIn[iChannelIn][iFrame+2] * weight2;
ppSamplesOut[iChannelOut][iFrame+3] += ppSamplesIn[iChannelIn][iFrame+3] * weight3;
iFrame += 4;
}
}
// Leftover.
for (; iFrame < frameCount; ++iFrame) {
ppSamplesOut[iChannelOut][iFrame] += ppSamplesIn[iChannelIn][iFrame] * pRouter->weights[iChannelIn][iChannelOut]; ppSamplesOut[iChannelOut][iFrame] += ppSamplesIn[iChannelIn][iFrame] * pRouter->weights[iChannelIn][iChannelOut];
} }
} }
...@@ -17701,15 +18120,14 @@ mal_uint64 mal_channel_router_read_deinterleaved(mal_channel_router* pRouter, ma ...@@ -17701,15 +18120,14 @@ mal_uint64 mal_channel_router_read_deinterleaved(mal_channel_router* pRouter, ma
float* ppNextSamplesOut[MAL_MAX_CHANNELS]; float* ppNextSamplesOut[MAL_MAX_CHANNELS];
mal_copy_memory(ppNextSamplesOut, ppSamplesOut, sizeof(float*) * pRouter->config.channelsOut); mal_copy_memory(ppNextSamplesOut, ppSamplesOut, sizeof(float*) * pRouter->config.channelsOut);
float temp[MAL_MAX_CHANNELS * 256]; MAL_ALIGN(MAL_SIMD_ALIGNMENT) float temp[MAL_MAX_CHANNELS * 256];
mal_assert(sizeof(temp) <= 0xFFFFFFFF); mal_assert(sizeof(temp) <= 0xFFFFFFFF);
mal_uint32 maxFramesToReadEachIteration = mal_countof(temp) / pRouter->config.channelsIn;
float* ppTemp[MAL_MAX_CHANNELS]; float* ppTemp[MAL_MAX_CHANNELS];
for (mal_uint32 iChannel = 0; iChannel < pRouter->config.channelsIn; iChannel += 1) { size_t maxBytesToReadPerFrameEachIteration;
ppTemp[iChannel] = temp + (maxFramesToReadEachIteration*iChannel); mal_split_buffer(temp, sizeof(temp), pRouter->config.channelsIn, MAL_SIMD_ALIGNMENT, (void**)&ppTemp, &maxBytesToReadPerFrameEachIteration);
}
size_t maxFramesToReadEachIteration = maxBytesToReadPerFrameEachIteration/sizeof(float);
mal_uint64 totalFramesRead = 0; mal_uint64 totalFramesRead = 0;
while (totalFramesRead < frameCount) { while (totalFramesRead < frameCount) {
...@@ -18073,7 +18491,7 @@ mal_uint64 mal_src_read_deinterleaved__linear(mal_src* pSRC, mal_uint64 frameCou ...@@ -18073,7 +18491,7 @@ mal_uint64 mal_src_read_deinterleaved__linear(mal_src* pSRC, mal_uint64 frameCou
void mal_pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode) void mal_pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{ {
if (formatOut == formatIn) { if (formatOut == formatIn) {
mal_copy_memory(pOut, pIn, sampleCount * mal_get_bytes_per_sample(formatOut)); mal_copy_memory_64(pOut, pIn, sampleCount * mal_get_bytes_per_sample(formatOut));
return; return;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment