Commit 4c4fe083 authored by David Reid's avatar David Reid

Early experimental SIMD work.

parent f89296d7
......@@ -47,6 +47,8 @@
// the development packages for any particular backend you can disable it by #define-ing the appropriate MAL_NO_*
// option before the implementation.
//
// Note that GCC and Clang requires "-msse2", "-mavx", etc. for SIMD optimizations.
//
//
// Building for Windows
// --------------------
......@@ -55,7 +57,7 @@
//
// Building for Linux
// ------------------
// The Linux build only requires linking to -ldl and -lpthread. You do not need any development packages for any
// The Linux build only requires linking to -ldl, -lpthread and -lm. You do not need any development packages for any
// of the supported backends.
//
// Building for BSD
......@@ -71,8 +73,7 @@
// Building for Emscripten
// -----------------------
// The Emscripten build currently uses SDL 1.2 for it's backend which means specifying "-s USE_SDL=2" is unecessary
// as of this version. However, if in the future there is legitimate benefit or enough demand for SDL 2 to be used
// instead, you will need to specify this when compiling.
// as of this version.
//
//
// Playback Example
......@@ -200,7 +201,19 @@
// Disables the decoding APIs.
//
// #define MAL_NO_STDIO
// Disables file IO APIs
// Disables file IO APIs.
//
// #define MAL_NO_SSE2
// Disables SSE2 optimizations.
//
// #define MAL_NO_AVX
// Disables AVX optimizations.
//
// #define MAL_NO_AVX512
// Disables AVX-512 optimizations.
//
// #define MAL_NO_NEON
// Disables NEON optimizations.
#ifndef mini_al_h
#define mini_al_h
......@@ -791,6 +804,10 @@ typedef struct
mal_channel channelMapIn[MAL_MAX_CHANNELS];
mal_channel channelMapOut[MAL_MAX_CHANNELS];
mal_channel_mix_mode mixingMode;
mal_bool32 noSSE2 : 1;
mal_bool32 noAVX : 1;
mal_bool32 noAVX512 : 1;
mal_bool32 noNEON : 1;
mal_channel_router_read_deinterleaved_proc onReadDeinterleaved;
void* pUserData;
} mal_channel_router_config;
......@@ -800,6 +817,10 @@ struct mal_channel_router
mal_channel_router_config config;
mal_bool32 isPassthrough : 1;
mal_bool32 isSimpleShuffle : 1;
mal_bool32 useSSE2 : 1;
mal_bool32 useAVX : 1;
mal_bool32 useAVX512 : 1;
mal_bool32 useNEON : 1;
mal_uint8 shuffleTable[MAL_MAX_CHANNELS];
float weights[MAL_MAX_CHANNELS][MAL_MAX_CHANNELS];
};
......@@ -2291,6 +2312,241 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
#endif
#endif
// Architecture Detection
#if defined(__x86_64__) || defined(_M_X64)
#define MAL_X64
#elif defined(__i386) || defined(_M_IX86)
#define MAL_X86
#elif defined(__arm__) || defined(_M_ARM)
#define MAL_ARM
#endif
// Intrinsics Support
#if defined(MAL_X64) || defined(MAL_X86)
#if defined(_MSC_VER)
// MSVC.
#if !defined(MAL_NO_SSE2) // Assume all MSVC compilers support SSE2 intrinsics.
#define MAL_SUPPORT_SSE2
#endif
#if _MSC_VER >= 1600 && !defined(MAL_NO_AVX) // 2010
#define MAL_SUPPORT_AVX
#endif
#if _MSC_VER >= 1910 && !defined(MAL_NO_AVX512) // 2017
#define MAL_SUPPORT_AVX512
#endif
#else
// Assume GNUC-style.
#if defined(__SSE2__) && !defined(MAL_NO_SSE2)
#define MAL_SUPPORT_SSE2
#endif
#if defined(__AVX__) && !defined(MAL_NO_AVX)
#define MAL_SUPPORT_AVX
#endif
#if defined(__AVX512F__) && !defined(MAL_NO_AVX512)
#define MAL_SUPPORT_AVX512
#endif
#endif
// If at this point we still haven't determined compiler support for the intrinsics just fall back to __has_include.
#if !defined(__GNUC__) && defined(__has_include)
#if !defined(MAL_SUPPORT_SSE2) && !defined(MAL_NO_SSE2) && __has_include(<emmintrin.h>)
#define MAL_SUPPORT_SSE2
#endif
#if !defined(MAL_SUPPORT_AVX) && !defined(MAL_NO_AVX) && __has_include(<immintrin.h>)
#define MAL_SUPPORT_AVX
#endif
#if !defined(MAL_SUPPORT_AVX512) && !defined(MAL_NO_AVX512) && __has_include(<zmmintrin.h>)
#define MAL_SUPPORT_AVX512
#endif
#endif
#if defined(MAL_SUPPORT_AVX512)
#include <immintrin.h> // Not a mistake. Intentionally including <immintrin.h> instead of <zmmintrin.h> because otherwise the compiler will complain.
#elif defined(MAL_SUPPORT_AVX)
#include <immintrin.h>
#elif defined(MAL_SUPPORT_SSE2)
#include <emmintrin.h>
#endif
#endif
#if defined(MAL_ARM)
#if !defined(MAL_NO_NEON) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
#define MAL_SUPPORT_NEON
#endif
// Fall back to looking for the #include file.
#if !defined(__GNUC__) && defined(__has_include)
#if !defined(MAL_SUPPORT_NEON) && !defined(MAL_NO_NEON) && __has_include(<arm_neon.h>)
#define MAL_SUPPORT_NEON
#endif
#endif
#if defined(MAL_SUPPORT_NEON)
#include <arm_neon.h>
#endif
#endif
#if defined(MAL_X64) || defined(MAL_X86)
#if defined(_MSC_VER)
#if _MSC_VER >= 1400
#include <intrin.h>
static MAL_INLINE void mal_cpuid(int info[4], int fid)
{
__cpuid(info, fid);
}
#else
#define MAL_NO_CPUID
#endif
#if _MSC_VER >= 1600
static MAL_INLINE unsigned __int64 mal_xgetbv(int reg)
{
return _xgetbv(reg);
}
#else
#define MAL_NO_XGETBV
#endif
#elif defined(__GNUC__) || defined(__clang__)
static MAL_INLINE void mal_cpuid(int info[4], int fid)
{
asm (
"movl %[fid], %%eax\n\t"
"cpuid\n\t"
"movl %%eax, %[info0]\n\t"
"movl %%ebx, %[info1]\n\t"
"movl %%ecx, %[info2]\n\t"
"movl %%edx, %[info3]\n\t"
: [info0] "=rm"(info[0]),
[info1] "=rm"(info[1]),
[info2] "=rm"(info[2]),
[info3] "=rm"(info[3])
: [fid] "rm"(fid)
: "eax", "ebx", "ecx", "edx"
);
}
static MAL_INLINE unsigned long long mal_xgetbv(int reg)
{
unsigned int hi;
unsigned int lo;
asm (
"movl %[reg], %%ecx\n\t"
"xgetbv\n\t"
"movl %%eax, %[lo]\n\t"
"movl %%edx, %[hi]\n\t"
: [lo] "=rm"(lo),
[hi] "=rm"(hi)
: [reg] "rm"(reg)
: "eax", "ecx", "edx"
);
return ((unsigned long long)hi << 32ULL) | (unsigned long long)lo;
}
#else
#define MAL_NO_CPUID
#define MAL_NO_XGETBV
#endif
#else
#define MAL_NO_CPUID
#define MAL_NO_XGETBV
#endif
static MAL_INLINE mal_bool32 mal_has_sse2()
{
#if (defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_SSE2)
#if defined(MAL_X64)
return MAL_TRUE; // 64-bit targets always support SSE2.
#elif (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)
return MAL_TRUE; // If the compiler is allowed to freely generate SSE2 code we can assume support.
#else
#if defined(MAL_NO_CPUID)
return MAL_FALSE;
#else
int info[4];
mal_cpuid(info, 1);
return (info[3] & (1 << 26)) != 0;
#endif
#endif
#else
return MAL_FALSE; // SSE2 is only supported on x86 and x64 architectures.
#endif
}
static MAL_INLINE mal_bool32 mal_has_avx()
{
#if (defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_AVX)
#if defined(_AVX_) || defined(__AVX__)
return MAL_TRUE; // If the compiler is allowed to freely generate AVX code we can assume support.
#else
// AVX requires both CPU and OS support.
#if defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
return MAL_FALSE;
#else
int info[4];
mal_cpuid(info, 1);
if (((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0)) {
mal_uint64 xrc = mal_xgetbv(0);
if ((xrc & 0x06) == 0x06) {
return MAL_TRUE;
} else {
return MAL_FALSE;
}
} else {
return MAL_FALSE;
}
#endif
#endif
#else
return MAL_FALSE; // AVX is only supported on x86 and x64 architectures.
#endif
}
static MAL_INLINE mal_bool32 mal_has_avx512f()
{
#if (defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_AVX512)
#if defined(__AVX512F__)
return MAL_TRUE; // If the compiler is allowed to freely generate AVX-512F code we can assume support.
#else
// AVX-512 requires both CPU and OS support.
#if defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
return MAL_FALSE;
#else
int info[4];
mal_cpuid(info, 1);
if (((info[2] & (1 << 27)) != 0) && ((info[1] & (1 << 16)) != 0)) {
mal_uint64 xrc = mal_xgetbv(0);
if ((xrc & 0xE6) == 0xE6) {
return MAL_TRUE;
} else {
return MAL_FALSE;
}
} else {
return MAL_FALSE;
}
#endif
#endif
#else
return MAL_FALSE; // AVX-512F is only supported on x86 and x64 architectures.
#endif
}
static MAL_INLINE mal_bool32 mal_has_neon()
{
#if defined(MAL_ARM) && !defined(MAL_NO_NEON)
#if (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
return MAL_TRUE; // If the compiler is allowed to freely generate NEON code we can assume support.
#else
// TODO: Runtime check.
return MAL_FALSE;
#endif
#else
return MAL_FALSE; // NEON is only supported on ARM architectures.
#endif
}
#ifndef MAL_PI
#define MAL_PI 3.14159265358979323846264f
#endif
......@@ -2300,9 +2556,9 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
// Unfortunately using runtime linking for pthreads causes problems. This has occurred for me when testing on FreeBSD. When
// using runtime linking, deadlocks can occur (for me it happens when loading data from fread()). It turns out that doing
// compile-time linking fixes this. I'm not sure why this happens, but this is the safest way I can think of to continue. To
// enable runtime linking, #define this before the implementation of this file. I am not officially supporting this, but I'm
// leaving it here in case it's useful for somebody, somewhere.
// compile-time linking fixes this. I'm not sure why this happens, but the safest way I can think of to fix this is to simply
// disable runtime linking by default. To enable runtime linking, #define this before the implementation of this file. I am
// not officially supporting this, but I'm leaving it here in case it's useful for somebody, somewhere.
//#define MAL_USE_RUNTIME_LINKING_FOR_PTHREAD
// Disable run-time linking on certain backends.
......@@ -15601,13 +15857,53 @@ mal_bool32 mal_channel_map_contains_channel_position(mal_uint32 channels, const
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//#define MAL_USE_REFERENCE_CONVERSION_APIS 1
#define MAL_USE_SSE
//#define MAL_USE_SSE
void mal_copy_memory_64(void* dst, const void* src, mal_uint64 sizeInBytes)
{
#if 0xFFFFFFFFFFFFFFFF <= SIZE_MAX
mal_copy_memory(dst, src, (size_t)sizeInBytes);
#else
while (sizeInBytes > 0) {
mal_uint64 bytesToCopyNow = sizeInBytes;
if (bytesToCopyNow > SIZE_MAX) {
bytesToCopyNow = SIZE_MAX;
}
mal_copy_memory(dst, src, (size_t)bytesToCopyNow); // Safe cast to size_t.
sizeInBytes -= bytesToCopyNow;
dst = ( void*)(( mal_uint8*)dst + bytesToCopyNow);
src = (const void*)((const mal_uint8*)src + bytesToCopyNow);
}
#endif
}
void mal_zero_memory_64(void* dst, mal_uint64 sizeInBytes)
{
#if 0xFFFFFFFFFFFFFFFF <= SIZE_MAX
mal_zero_memory(dst, (size_t)sizeInBytes);
#else
while (sizeInBytes > 0) {
mal_uint64 bytesToZeroNow = sizeInBytes;
if (bytesToZeroNow > SIZE_MAX) {
bytesToZeroNow = SIZE_MAX;
}
mal_zero_memory(dst, (size_t)bytesToZeroNow); // Safe cast to size_t.
sizeInBytes -= bytesToZeroNow;
dst = (void*)((mal_uint8*)dst + bytesToZeroNow);
}
#endif
}
// u8
void mal_pcm_u8_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
(void)ditherMode;
mal_copy_memory(dst, src, count * sizeof(mal_uint8));
mal_copy_memory_64(dst, src, count * sizeof(mal_uint8));
}
......@@ -15803,7 +16099,7 @@ void mal_pcm_interleave_u8__optimized(void* dst, const void** src, mal_uint64 fr
const mal_uint8** src_u8 = (const mal_uint8**)src;
if (channels == 1) {
mal_copy_memory(dst, src[0], frameCount * sizeof(mal_uint8));
mal_copy_memory_64(dst, src[0], frameCount * sizeof(mal_uint8));
} else if (channels == 2) {
mal_uint64 iFrame;
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
......@@ -15906,7 +16202,7 @@ void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
void mal_pcm_s16_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
(void)ditherMode;
mal_copy_memory(dst, src, count * sizeof(mal_int16));
mal_copy_memory_64(dst, src, count * sizeof(mal_int16));
}
......@@ -16185,7 +16481,7 @@ void mal_pcm_s24_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
{
(void)ditherMode;
mal_copy_memory(dst, src, count * 3);
mal_copy_memory_64(dst, src, count * 3);
}
......@@ -16472,7 +16768,7 @@ void mal_pcm_s32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
{
(void)ditherMode;
mal_copy_memory(dst, src, count * sizeof(mal_int32));
mal_copy_memory_64(dst, src, count * sizeof(mal_int32));
}
......@@ -16791,7 +17087,7 @@ void mal_pcm_f32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
{
(void)ditherMode;
mal_copy_memory(dst, src, count * sizeof(float));
mal_copy_memory_64(dst, src, count * sizeof(float));
}
......@@ -17214,6 +17510,40 @@ mal_uint64 mal_format_converter_read_deinterleaved(mal_format_converter* pConver
//
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Splits a buffer into parts of equal length and of the given alignment. The returned size of the split buffers will be a
// multiple of the alignment. The alignment must be a power of 2.
void mal_split_buffer(void* pBuffer, size_t bufferSize, size_t splitCount, size_t alignment, void** ppBuffersOut, size_t* pSplitSizeOut)
{
if (pBuffer == NULL || bufferSize == 0 || splitCount == 0) {
return;
}
if (alignment == 0) {
alignment = 1;
}
mal_uintptr pBufferUnaligned = (mal_uintptr)pBuffer;
mal_uintptr pBufferAligned = (pBufferUnaligned + (alignment-1)) & ~(alignment-1);
size_t unalignedBytes = (size_t)(pBufferAligned - pBufferUnaligned);
size_t splitSize = 0;
if (bufferSize >= unalignedBytes) {
splitSize = (bufferSize - unalignedBytes) / splitCount;
splitSize = splitSize & ~(alignment-1);
}
if (ppBuffersOut != NULL) {
for (size_t i = 0; i < splitCount; ++i) {
ppBuffersOut[i] = (mal_uint8*)(pBufferAligned + (splitSize*i));
}
}
if (pSplitSizeOut) {
*pSplitSizeOut = splitSize;
}
}
// -X = Left, +X = Right
// -Y = Bottom, +Y = Top
// -Z = Front, +Z = Back
......@@ -17456,6 +17786,12 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
pRouter->config = *pConfig;
// SIMD
pRouter->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
pRouter->useAVX = mal_has_avx() && !pConfig->noAVX;
pRouter->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
pRouter->useNEON = mal_has_neon() && !pConfig->noNEON;
// If the input and output channels and channel maps are the same we should use a passthrough.
if (pRouter->config.channelsIn == pRouter->config.channelsOut) {
if (mal_channel_map_equal(pRouter->config.channelsIn, pRouter->config.channelMapIn, pRouter->config.channelMapOut)) {
......@@ -17631,6 +17967,26 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
return MAL_SUCCESS;
}
static MAL_INLINE mal_bool32 mal_channel_router__can_use_sse2(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
return pRouter->useSSE2 && (((mal_uintptr)pSamplesOut & 15) == 0) && (((mal_uintptr)pSamplesIn & 15) == 0);
}
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
return pRouter->useAVX && (((mal_uintptr)pSamplesOut & 31) == 0) && (((mal_uintptr)pSamplesIn & 31) == 0);
}
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx512(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
return pRouter->useAVX512 && (((mal_uintptr)pSamplesOut & 63) == 0) && (((mal_uintptr)pSamplesIn & 63) == 0);
}
static MAL_INLINE mal_bool32 mal_channel_router__can_use_neon(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
return pRouter->useNEON && (((mal_uintptr)pSamplesOut & 15) == 0) && (((mal_uintptr)pSamplesIn & 15) == 0);
}
void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 frameCount, float** ppSamplesOut, const float** ppSamplesIn)
{
mal_assert(pRouter != NULL);
......@@ -17641,20 +17997,83 @@ void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 fram
mal_assert(pRouter->config.channelsIn == pRouter->config.channelsOut);
for (mal_uint32 iChannelIn = 0; iChannelIn < pRouter->config.channelsIn; ++iChannelIn) {
mal_uint32 iChannelOut = pRouter->shuffleTable[iChannelIn];
mal_copy_memory(ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn], frameCount * sizeof(float));
mal_copy_memory_64(ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn], frameCount * sizeof(float));
}
} else {
// This is the more complicated case. Each of the output channels is accumulated with 0 or more input channels.
// Clear.
for (mal_uint32 iChannelOut = 0; iChannelOut < pRouter->config.channelsOut; ++iChannelOut) {
mal_zero_memory(ppSamplesOut[iChannelOut], frameCount * sizeof(float));
mal_zero_memory_64(ppSamplesOut[iChannelOut], frameCount * sizeof(float));
}
// Accumulate.
for (mal_uint32 iChannelIn = 0; iChannelIn < pRouter->config.channelsIn; ++iChannelIn) {
for (mal_uint32 iChannelOut = 0; iChannelOut < pRouter->config.channelsOut; ++iChannelOut) {
for (mal_uint64 iFrame = 0; iFrame < frameCount; ++iFrame) {
mal_uint64 iFrame = 0;
#if defined(MAL_SUPPORT_AVX512)
if (mal_channel_router__can_use_avx512(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
__m512 weight = _mm512_set1_ps(pRouter->weights[iChannelIn][iChannelOut]);
mal_uint64 frameCount16 = frameCount/16;
for (mal_uint64 iFrame16 = 0; iFrame16 < frameCount16; iFrame16 += 1) {
__m512* pO = (__m512*)ppSamplesOut[iChannelOut] + iFrame16;
__m512* pI = (__m512*)ppSamplesIn [iChannelIn ] + iFrame16;
*pO = _mm512_add_ps(*pO, _mm512_mul_ps(*pI, weight));
}
iFrame += frameCount16*16;
}
else
#endif
#if defined(MAL_SUPPORT_AVX)
if (mal_channel_router__can_use_avx(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
__m256 weight = _mm256_set1_ps(pRouter->weights[iChannelIn][iChannelOut]);
mal_uint64 frameCount8 = frameCount/8;
for (mal_uint64 iFrame8 = 0; iFrame8 < frameCount8; iFrame8 += 1) {
__m256* pO = (__m256*)ppSamplesOut[iChannelOut] + iFrame8;
__m256* pI = (__m256*)ppSamplesIn [iChannelIn ] + iFrame8;
*pO = _mm256_add_ps(*pO, _mm256_mul_ps(*pI, weight));
}
iFrame += frameCount8*8;
}
else
#endif
#if defined(MAL_SUPPORT_SSE2)
if (mal_channel_router__can_use_sse2(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
__m128 weight = _mm_set1_ps(pRouter->weights[iChannelIn][iChannelOut]);
mal_uint64 frameCount4 = frameCount/4;
for (mal_uint64 iFrame4 = 0; iFrame4 < frameCount4; iFrame4 += 1) {
__m128* pO = (__m128*)ppSamplesOut[iChannelOut] + iFrame4;
__m128* pI = (__m128*)ppSamplesIn [iChannelIn ] + iFrame4;
*pO = _mm_add_ps(*pO, _mm_mul_ps(*pI, weight));
}
iFrame += frameCount4*4;
} else
#endif
{ // Reference.
float weight0 = pRouter->weights[iChannelIn][iChannelOut];
float weight1 = pRouter->weights[iChannelIn][iChannelOut];
float weight2 = pRouter->weights[iChannelIn][iChannelOut];
float weight3 = pRouter->weights[iChannelIn][iChannelOut];
mal_uint64 frameCount4 = frameCount/4;
for (mal_uint64 iFrame4 = 0; iFrame4 < frameCount4; iFrame4 += 1) {
ppSamplesOut[iChannelOut][iFrame+0] += ppSamplesIn[iChannelIn][iFrame+0] * weight0;
ppSamplesOut[iChannelOut][iFrame+1] += ppSamplesIn[iChannelIn][iFrame+1] * weight1;
ppSamplesOut[iChannelOut][iFrame+2] += ppSamplesIn[iChannelIn][iFrame+2] * weight2;
ppSamplesOut[iChannelOut][iFrame+3] += ppSamplesIn[iChannelIn][iFrame+3] * weight3;
iFrame += 4;
}
}
// Leftover.
for (; iFrame < frameCount; ++iFrame) {
ppSamplesOut[iChannelOut][iFrame] += ppSamplesIn[iChannelIn][iFrame] * pRouter->weights[iChannelIn][iChannelOut];
}
}
......@@ -17701,15 +18120,14 @@ mal_uint64 mal_channel_router_read_deinterleaved(mal_channel_router* pRouter, ma
float* ppNextSamplesOut[MAL_MAX_CHANNELS];
mal_copy_memory(ppNextSamplesOut, ppSamplesOut, sizeof(float*) * pRouter->config.channelsOut);
float temp[MAL_MAX_CHANNELS * 256];
MAL_ALIGN(MAL_SIMD_ALIGNMENT) float temp[MAL_MAX_CHANNELS * 256];
mal_assert(sizeof(temp) <= 0xFFFFFFFF);
mal_uint32 maxFramesToReadEachIteration = mal_countof(temp) / pRouter->config.channelsIn;
float* ppTemp[MAL_MAX_CHANNELS];
for (mal_uint32 iChannel = 0; iChannel < pRouter->config.channelsIn; iChannel += 1) {
ppTemp[iChannel] = temp + (maxFramesToReadEachIteration*iChannel);
}
size_t maxBytesToReadPerFrameEachIteration;
mal_split_buffer(temp, sizeof(temp), pRouter->config.channelsIn, MAL_SIMD_ALIGNMENT, (void**)&ppTemp, &maxBytesToReadPerFrameEachIteration);
size_t maxFramesToReadEachIteration = maxBytesToReadPerFrameEachIteration/sizeof(float);
mal_uint64 totalFramesRead = 0;
while (totalFramesRead < frameCount) {
......@@ -18073,7 +18491,7 @@ mal_uint64 mal_src_read_deinterleaved__linear(mal_src* pSRC, mal_uint64 frameCou
void mal_pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
if (formatOut == formatIn) {
mal_copy_memory(pOut, pIn, sampleCount * mal_get_bytes_per_sample(formatOut));
mal_copy_memory_64(pOut, pIn, sampleCount * mal_get_bytes_per_sample(formatOut));
return;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment