Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
M
miniaudio
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Locked Files
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Security & Compliance
Security & Compliance
Dependency List
License Compliance
Packages
Packages
List
Container Registry
Analytics
Analytics
CI / CD
Code Review
Insights
Issues
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
MyCard
miniaudio
Commits
4c4fe083
Commit
4c4fe083
authored
Apr 21, 2018
by
David Reid
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Early experimental SIMD work.
parent
f89296d7
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
442 additions
and
24 deletions
+442
-24
mini_al.h
mini_al.h
+442
-24
No files found.
mini_al.h
View file @
4c4fe083
...
@@ -47,6 +47,8 @@
...
@@ -47,6 +47,8 @@
// the development packages for any particular backend you can disable it by #define-ing the appropriate MAL_NO_*
// the development packages for any particular backend you can disable it by #define-ing the appropriate MAL_NO_*
// option before the implementation.
// option before the implementation.
//
//
// Note that GCC and Clang require "-msse2", "-mavx", etc. for SIMD optimizations.
//
//
//
// Building for Windows
// Building for Windows
// --------------------
// --------------------
...
@@ -55,7 +57,7 @@
...
@@ -55,7 +57,7 @@
//
//
// Building for Linux
// Building for Linux
// ------------------
// ------------------
// The Linux build only requires linking to -ldl
and -lpthread
. You do not need any development packages for any
// The Linux build only requires linking to -ldl
, -lpthread and -lm
. You do not need any development packages for any
// of the supported backends.
// of the supported backends.
//
//
// Building for BSD
// Building for BSD
...
@@ -71,8 +73,7 @@
...
@@ -71,8 +73,7 @@
// Building for Emscripten
// Building for Emscripten
// -----------------------
// -----------------------
// The Emscripten build currently uses SDL 1.2 for its backend which means specifying "-s USE_SDL=2" is unnecessary
// The Emscripten build currently uses SDL 1.2 for its backend which means specifying "-s USE_SDL=2" is unnecessary
// as of this version. However, if in the future there is legitimate benefit or enough demand for SDL 2 to be used
// as of this version.
// instead, you will need to specify this when compiling.
//
//
//
//
// Playback Example
// Playback Example
...
@@ -200,7 +201,19 @@
...
@@ -200,7 +201,19 @@
// Disables the decoding APIs.
// Disables the decoding APIs.
//
//
// #define MAL_NO_STDIO
// #define MAL_NO_STDIO
// Disables file IO APIs
// Disables file IO APIs.
//
// #define MAL_NO_SSE2
// Disables SSE2 optimizations.
//
// #define MAL_NO_AVX
// Disables AVX optimizations.
//
// #define MAL_NO_AVX512
// Disables AVX-512 optimizations.
//
// #define MAL_NO_NEON
// Disables NEON optimizations.
#ifndef mini_al_h
#ifndef mini_al_h
#define mini_al_h
#define mini_al_h
...
@@ -791,6 +804,10 @@ typedef struct
...
@@ -791,6 +804,10 @@ typedef struct
mal_channel
channelMapIn
[
MAL_MAX_CHANNELS
];
mal_channel
channelMapIn
[
MAL_MAX_CHANNELS
];
mal_channel
channelMapOut
[
MAL_MAX_CHANNELS
];
mal_channel
channelMapOut
[
MAL_MAX_CHANNELS
];
mal_channel_mix_mode
mixingMode
;
mal_channel_mix_mode
mixingMode
;
mal_bool32
noSSE2
:
1
;
mal_bool32
noAVX
:
1
;
mal_bool32
noAVX512
:
1
;
mal_bool32
noNEON
:
1
;
mal_channel_router_read_deinterleaved_proc
onReadDeinterleaved
;
mal_channel_router_read_deinterleaved_proc
onReadDeinterleaved
;
void
*
pUserData
;
void
*
pUserData
;
}
mal_channel_router_config
;
}
mal_channel_router_config
;
...
@@ -800,6 +817,10 @@ struct mal_channel_router
...
@@ -800,6 +817,10 @@ struct mal_channel_router
mal_channel_router_config
config
;
mal_channel_router_config
config
;
mal_bool32
isPassthrough
:
1
;
mal_bool32
isPassthrough
:
1
;
mal_bool32
isSimpleShuffle
:
1
;
mal_bool32
isSimpleShuffle
:
1
;
mal_bool32
useSSE2
:
1
;
mal_bool32
useAVX
:
1
;
mal_bool32
useAVX512
:
1
;
mal_bool32
useNEON
:
1
;
mal_uint8
shuffleTable
[
MAL_MAX_CHANNELS
];
mal_uint8
shuffleTable
[
MAL_MAX_CHANNELS
];
float
weights
[
MAL_MAX_CHANNELS
][
MAL_MAX_CHANNELS
];
float
weights
[
MAL_MAX_CHANNELS
][
MAL_MAX_CHANNELS
];
};
};
...
@@ -2291,6 +2312,241 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
...
@@ -2291,6 +2312,241 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
#endif
#endif
#endif
#endif
// Architecture Detection
//
// At most one of MAL_X64, MAL_X86 or MAL_ARM is defined. None is defined when the target is not recognised.
//
// MAL_ARM is defined for both 32-bit ARM and AArch64. The AArch64 compilers do not define __arm__ or _M_ARM, so
// __aarch64__ and _M_ARM64 need to be tested explicitly or the NEON detection would never run on those targets.
#if defined(__x86_64__) || defined(_M_X64)
#define MAL_X64
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#define MAL_X86
#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
#define MAL_ARM
#endif
// Intrinsics Support
//
// This section decides at compile time which SIMD intrinsics may be used and pulls in the matching headers. The
// result is a set of MAL_SUPPORT_* macros (SSE2, AVX, AVX512, NEON). Each one can be suppressed by defining the
// corresponding MAL_NO_* option before this point.
#if defined(MAL_X64) || defined(MAL_X86)
#if defined(_MSC_VER)
// MSVC.
#if !defined(MAL_NO_SSE2) // Assume all MSVC compilers support SSE2 intrinsics.
#define MAL_SUPPORT_SSE2
#endif
#if _MSC_VER >= 1600 && !defined(MAL_NO_AVX) // 2010
#define MAL_SUPPORT_AVX
#endif
#if _MSC_VER >= 1910 && !defined(MAL_NO_AVX512) // 2017
#define MAL_SUPPORT_AVX512
#endif
#else
// Assume GNUC-style.
// Here support is keyed off the compiler's own feature macros (__SSE2__, __AVX__, __AVX512F__).
#if defined(__SSE2__) && !defined(MAL_NO_SSE2)
#define MAL_SUPPORT_SSE2
#endif
#if defined(__AVX__) && !defined(MAL_NO_AVX)
#define MAL_SUPPORT_AVX
#endif
#if defined(__AVX512F__) && !defined(MAL_NO_AVX512)
#define MAL_SUPPORT_AVX512
#endif
#endif
// If at this point we still haven't determined compiler support for the intrinsics just fall back to __has_include.
// This fallback is skipped entirely when __GNUC__ is defined.
#if !defined(__GNUC__) && defined(__has_include)
#if !defined(MAL_SUPPORT_SSE2) && !defined(MAL_NO_SSE2) && __has_include(<emmintrin.h>)
#define MAL_SUPPORT_SSE2
#endif
#if !defined(MAL_SUPPORT_AVX) && !defined(MAL_NO_AVX) && __has_include(<immintrin.h>)
#define MAL_SUPPORT_AVX
#endif
#if !defined(MAL_SUPPORT_AVX512) && !defined(MAL_NO_AVX512) && __has_include(<zmmintrin.h>)
#define MAL_SUPPORT_AVX512
#endif
#endif
// Include a single header for the widest instruction set that was enabled above.
#if defined(MAL_SUPPORT_AVX512)
#include <immintrin.h> // Not a mistake. Intentionally including <immintrin.h> instead of <zmmintrin.h> because otherwise the compiler will complain.
#elif defined(MAL_SUPPORT_AVX)
#include <immintrin.h>
#elif defined(MAL_SUPPORT_SSE2)
#include <emmintrin.h>
#endif
#endif
// ARM: NEON is enabled when the compiler advertises it, with the same __has_include fallback as above.
#if defined(MAL_ARM)
#if !defined(MAL_NO_NEON) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
#define MAL_SUPPORT_NEON
#endif
// Fall back to looking for the #include file.
#if !defined(__GNUC__) && defined(__has_include)
#if !defined(MAL_SUPPORT_NEON) && !defined(MAL_NO_NEON) && __has_include(<arm_neon.h>)
#define MAL_SUPPORT_NEON
#endif
#endif
#if defined(MAL_SUPPORT_NEON)
#include <arm_neon.h>
#endif
#endif
#if defined(MAL_X64) || defined(MAL_X86)
#if defined(_MSC_VER)
#if _MSC_VER >= 1400
#include <intrin.h>
// MSVC implementation of mal_cpuid().
//
// Runs the CPUID instruction for the function id `fid` through the __cpuid intrinsic from <intrin.h>. The four
// result values are written to `info`.
static MAL_INLINE void mal_cpuid(int info[4], int fid)
{
    __cpuid(info, fid);
}
#else
#define MAL_NO_CPUID
#endif
#if _MSC_VER >= 1600
// MSVC implementation of mal_xgetbv().
//
// Reads the extended control register selected by `reg` through the _xgetbv intrinsic and returns its value.
static MAL_INLINE unsigned __int64 mal_xgetbv(int reg)
{
    unsigned __int64 value = _xgetbv(reg);
    return value;
}
#else
#define MAL_NO_XGETBV
#endif
#elif defined(__GNUC__) || defined(__clang__)
// GCC/Clang implementation of mal_cpuid().
//
// Runs the CPUID instruction for the function id (leaf) `fid` and stores the results as:
//   info[0] = EAX, info[1] = EBX, info[2] = ECX, info[3] = EDX
//
// Notes:
//   - __asm__ is used instead of asm because the plain keyword is not available in strict ISO modes (-std=c99, etc.).
//   - ECX is explicitly set to 0 on entry. CPUID treats ECX as a sub-leaf selector for some leaves, so leaving it
//     undefined would make the results of those leaves unpredictable.
//   - Register constraints are used in place of explicit mov instructions and a clobber list, which lets the compiler
//     do the register allocation.
static MAL_INLINE void mal_cpuid(int info[4], int fid)
{
    __asm__ __volatile__ (
        "cpuid"
        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "a"(fid), "c"(0)
    );
}
// GCC/Clang implementation of mal_xgetbv().
//
// Executes XGETBV with ECX = `reg` and returns the 64-bit result, which the instruction delivers as EDX:EAX (high
// 32 bits in EDX, low 32 bits in EAX).
//
// __asm__ is used instead of asm because the plain keyword is not available in strict ISO modes (-std=c99, etc.).
// Register constraints replace the explicit mov instructions and the clobber list.
static MAL_INLINE unsigned long long mal_xgetbv(int reg)
{
    unsigned int hi;
    unsigned int lo;

    __asm__ __volatile__ (
        "xgetbv"
        : "=a"(lo), "=d"(hi)
        : "c"(reg)
    );

    return ((unsigned long long)hi << 32) | (unsigned long long)lo;
}
#else
#define MAL_NO_CPUID
#define MAL_NO_XGETBV
#endif
#else
#define MAL_NO_CPUID
#define MAL_NO_XGETBV
#endif
// Determines whether or not SSE2 can be used.
//
// Returns MAL_FALSE on non-x86/x64 builds and when MAL_NO_SSE2 is defined. Otherwise support is assumed on x64 and when
// the compiler itself is generating SSE2 code. Failing that, CPUID is queried at run time (or MAL_FALSE is returned when
// CPUID is unavailable).
static MAL_INLINE mal_bool32 mal_has_sse2()
{
#if !(defined(MAL_X64) || defined(MAL_X86)) || defined(MAL_NO_SSE2)
    return MAL_FALSE;   // SSE2 is only supported on x86 and x64 architectures.
#elif defined(MAL_X64)
    return MAL_TRUE;    // 64-bit targets always support SSE2.
#elif (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)
    return MAL_TRUE;    // If the compiler is allowed to freely generate SSE2 code we can assume support.
#elif defined(MAL_NO_CPUID)
    return MAL_FALSE;   // No way of asking the CPU.
#else
    int regs[4];
    mal_cpuid(regs, 1);

    // Function 1, EDX bit 26.
    return (regs[3] & (1 << 26)) != 0;
#endif
}
// Determines whether or not AVX can be used.
//
// Returns MAL_FALSE on non-x86/x64 builds and when MAL_NO_AVX is defined. Support is assumed when the compiler itself is
// generating AVX code. Otherwise both the CPU (via CPUID) and the OS (via XGETBV) are queried at run time.
static MAL_INLINE mal_bool32 mal_has_avx()
{
#if !(defined(MAL_X64) || defined(MAL_X86)) || defined(MAL_NO_AVX)
    return MAL_FALSE;   // AVX is only supported on x86 and x64 architectures.
#elif defined(_AVX_) || defined(__AVX__)
    return MAL_TRUE;    // If the compiler is allowed to freely generate AVX code we can assume support.
#elif defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
    return MAL_FALSE;   // AVX requires both CPU and OS support, and we have no way of checking.
#else
    int regs[4];
    mal_cpuid(regs, 1);

    // Both ECX bit 27 and ECX bit 28 of function 1 must be set.
    if ((regs[2] & (1 << 27)) == 0 || (regs[2] & (1 << 28)) == 0) {
        return MAL_FALSE;
    }

    // Bits 1 and 2 of extended control register 0 must both be set.
    mal_uint64 xcr0 = mal_xgetbv(0);
    if ((xcr0 & 0x06) != 0x06) {
        return MAL_FALSE;
    }

    return MAL_TRUE;
#endif
}
#if (defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_AVX512) && !defined(__AVX512F__) && !defined(MAL_NO_CPUID) && !defined(MAL_NO_XGETBV)
// Private helper for mal_has_avx512f(). Runs CPUID for `leaf` with ECX explicitly set to `subleaf` and stores
// EAX, EBX, ECX and EDX in info[0..3]. This is needed because leaf 7 uses ECX as a sub-leaf selector.
static MAL_INLINE void mal_cpuid__subleaf(int info[4], int leaf, int subleaf)
{
#if defined(_MSC_VER)
    __cpuidex(info, leaf, subleaf);
#else
    __asm__ __volatile__ (
        "cpuid"
        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "a"(leaf), "c"(subleaf)
    );
#endif
}
#endif

// Determines whether or not AVX-512F can be used.
//
// Returns MAL_FALSE on non-x86/x64 builds and when MAL_NO_AVX512 is defined. Support is assumed when the compiler
// itself is generating AVX-512F code. Otherwise both the CPU (via CPUID) and the OS (via XGETBV) are queried at run
// time, and MAL_FALSE is returned if either of those instructions is unavailable to us.
//
// The AVX-512F feature flag is EBX bit 16 of CPUID leaf 7, sub-leaf 0. It is not reported by leaf 1, where the same
// EBX bits hold the logical processor count.
static MAL_INLINE mal_bool32 mal_has_avx512f()
{
#if (defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_AVX512)
    #if defined(__AVX512F__)
        return MAL_TRUE;    // If the compiler is allowed to freely generate AVX-512F code we can assume support.
    #else
        // AVX-512 requires both CPU and OS support.
        #if defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
            return MAL_FALSE;
        #else
            int info[4];
            mal_uint64 xcr0;

            // Leaf 0 reports the highest supported basic leaf in EAX. Leaf 7 must exist before it can be queried.
            mal_cpuid(info, 0);
            if (info[0] < 7) {
                return MAL_FALSE;
            }

            // Leaf 1, ECX bit 27 (OSXSAVE): XGETBV is only usable when this is set.
            mal_cpuid(info, 1);
            if ((info[2] & (1 << 27)) == 0) {
                return MAL_FALSE;
            }

            // Leaf 7, sub-leaf 0, EBX bit 16 (AVX512F).
            mal_cpuid__subleaf(info, 7, 0);
            if ((info[1] & (1 << 16)) == 0) {
                return MAL_FALSE;
            }

            // The OS must have enabled the XMM, YMM, opmask and ZMM register state (XCR0 bits 1, 2, 5, 6 and 7).
            xcr0 = mal_xgetbv(0);
            if ((xcr0 & 0xE6) == 0xE6) {
                return MAL_TRUE;
            } else {
                return MAL_FALSE;
            }
        #endif
    #endif
#else
    return MAL_FALSE;       // AVX-512F is only supported on x86 and x64 architectures.
#endif
}
// Determines whether or not NEON can be used.
//
// Returns MAL_FALSE on non-ARM builds and when MAL_NO_NEON is defined. Support is assumed when the compiler itself is
// generating NEON code. There is currently no run-time check, so every other case reports MAL_FALSE.
static MAL_INLINE mal_bool32 mal_has_neon()
{
#if !defined(MAL_ARM) || defined(MAL_NO_NEON)
    return MAL_FALSE;   // NEON is only supported on ARM architectures.
#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64)
    return MAL_TRUE;    // If the compiler is allowed to freely generate NEON code we can assume support.
#else
    // TODO: Runtime check.
    return MAL_FALSE;
#endif
}
#ifndef MAL_PI
#ifndef MAL_PI
#define MAL_PI 3.14159265358979323846264f
#define MAL_PI 3.14159265358979323846264f
#endif
#endif
...
@@ -2300,9 +2556,9 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
...
@@ -2300,9 +2556,9 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
// Unfortunately using runtime linking for pthreads causes problems. This has occurred for me when testing on FreeBSD. When
// Unfortunately using runtime linking for pthreads causes problems. This has occurred for me when testing on FreeBSD. When
// using runtime linking, deadlocks can occur (for me it happens when loading data from fread()). It turns out that doing
// using runtime linking, deadlocks can occur (for me it happens when loading data from fread()). It turns out that doing
// compile-time linking fixes this. I'm not sure why this happens, but th
is is the safest way I can think of to continue. To
// compile-time linking fixes this. I'm not sure why this happens, but th
e safest way I can think of to fix this is to simply
//
enable runtime linking, #define this before the implementation of this file. I am not officially supporting this, but I'
m
//
disable runtime linking by default. To enable runtime linking, #define this before the implementation of this file. I a
m
// leaving it here in case it's useful for somebody, somewhere.
//
not officially supporting this, but I'm
leaving it here in case it's useful for somebody, somewhere.
//#define MAL_USE_RUNTIME_LINKING_FOR_PTHREAD
//#define MAL_USE_RUNTIME_LINKING_FOR_PTHREAD
// Disable run-time linking on certain backends.
// Disable run-time linking on certain backends.
...
@@ -15601,13 +15857,53 @@ mal_bool32 mal_channel_map_contains_channel_position(mal_uint32 channels, const
...
@@ -15601,13 +15857,53 @@ mal_bool32 mal_channel_map_contains_channel_position(mal_uint32 channels, const
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//#define MAL_USE_REFERENCE_CONVERSION_APIS 1
//#define MAL_USE_REFERENCE_CONVERSION_APIS 1
#define MAL_USE_SSE
//#define MAL_USE_SSE
// Copies sizeInBytes bytes from src to dst, where the byte count is a 64-bit value.
//
// When size_t can hold any 64-bit value this is a single mal_copy_memory() call. Otherwise the copy is performed in
// pieces of at most SIZE_MAX bytes so that every cast to size_t is lossless.
void mal_copy_memory_64(void* dst, const void* src, mal_uint64 sizeInBytes)
{
#if 0xFFFFFFFFFFFFFFFF <= SIZE_MAX
    mal_copy_memory(dst, src, (size_t)sizeInBytes);
#else
    mal_uint64 bytesRemaining = sizeInBytes;

    while (bytesRemaining > 0) {
        mal_uint64 chunkSize = (bytesRemaining > SIZE_MAX) ? SIZE_MAX : bytesRemaining;

        mal_copy_memory(dst, src, (size_t)chunkSize);   // Safe cast to size_t.

        // Advance both cursors past the piece that was just copied.
        dst = (void*)((mal_uint8*)dst + chunkSize);
        src = (const void*)((const mal_uint8*)src + chunkSize);
        bytesRemaining -= chunkSize;
    }
#endif
}
// Zeroes sizeInBytes bytes starting at dst, where the byte count is a 64-bit value.
//
// When size_t can hold any 64-bit value this is a single mal_zero_memory() call. Otherwise the buffer is cleared in
// pieces of at most SIZE_MAX bytes so that every cast to size_t is lossless.
void mal_zero_memory_64(void* dst, mal_uint64 sizeInBytes)
{
#if 0xFFFFFFFFFFFFFFFF <= SIZE_MAX
    mal_zero_memory(dst, (size_t)sizeInBytes);
#else
    mal_uint64 bytesRemaining = sizeInBytes;

    while (bytesRemaining > 0) {
        mal_uint64 chunkSize = (bytesRemaining > SIZE_MAX) ? SIZE_MAX : bytesRemaining;

        mal_zero_memory(dst, (size_t)chunkSize);        // Safe cast to size_t.

        // Advance the cursor past the piece that was just cleared.
        dst = (void*)((mal_uint8*)dst + chunkSize);
        bytesRemaining -= chunkSize;
    }
#endif
}
// u8
// u8
void
mal_pcm_u8_to_u8
(
void
*
dst
,
const
void
*
src
,
mal_uint64
count
,
mal_dither_mode
ditherMode
)
void
mal_pcm_u8_to_u8
(
void
*
dst
,
const
void
*
src
,
mal_uint64
count
,
mal_dither_mode
ditherMode
)
{
{
(
void
)
ditherMode
;
(
void
)
ditherMode
;
mal_copy_memory
(
dst
,
src
,
count
*
sizeof
(
mal_uint8
));
mal_copy_memory
_64
(
dst
,
src
,
count
*
sizeof
(
mal_uint8
));
}
}
...
@@ -15803,7 +16099,7 @@ void mal_pcm_interleave_u8__optimized(void* dst, const void** src, mal_uint64 fr
...
@@ -15803,7 +16099,7 @@ void mal_pcm_interleave_u8__optimized(void* dst, const void** src, mal_uint64 fr
const
mal_uint8
**
src_u8
=
(
const
mal_uint8
**
)
src
;
const
mal_uint8
**
src_u8
=
(
const
mal_uint8
**
)
src
;
if
(
channels
==
1
)
{
if
(
channels
==
1
)
{
mal_copy_memory
(
dst
,
src
[
0
],
frameCount
*
sizeof
(
mal_uint8
));
mal_copy_memory
_64
(
dst
,
src
[
0
],
frameCount
*
sizeof
(
mal_uint8
));
}
else
if
(
channels
==
2
)
{
}
else
if
(
channels
==
2
)
{
mal_uint64
iFrame
;
mal_uint64
iFrame
;
for
(
iFrame
=
0
;
iFrame
<
frameCount
;
iFrame
+=
1
)
{
for
(
iFrame
=
0
;
iFrame
<
frameCount
;
iFrame
+=
1
)
{
...
@@ -15906,7 +16202,7 @@ void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
...
@@ -15906,7 +16202,7 @@ void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
void
mal_pcm_s16_to_s16
(
void
*
dst
,
const
void
*
src
,
mal_uint64
count
,
mal_dither_mode
ditherMode
)
void
mal_pcm_s16_to_s16
(
void
*
dst
,
const
void
*
src
,
mal_uint64
count
,
mal_dither_mode
ditherMode
)
{
{
(
void
)
ditherMode
;
(
void
)
ditherMode
;
mal_copy_memory
(
dst
,
src
,
count
*
sizeof
(
mal_int16
));
mal_copy_memory
_64
(
dst
,
src
,
count
*
sizeof
(
mal_int16
));
}
}
...
@@ -16185,7 +16481,7 @@ void mal_pcm_s24_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
...
@@ -16185,7 +16481,7 @@ void mal_pcm_s24_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
{
{
(
void
)
ditherMode
;
(
void
)
ditherMode
;
mal_copy_memory
(
dst
,
src
,
count
*
3
);
mal_copy_memory
_64
(
dst
,
src
,
count
*
3
);
}
}
...
@@ -16472,7 +16768,7 @@ void mal_pcm_s32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
...
@@ -16472,7 +16768,7 @@ void mal_pcm_s32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
{
{
(
void
)
ditherMode
;
(
void
)
ditherMode
;
mal_copy_memory
(
dst
,
src
,
count
*
sizeof
(
mal_int32
));
mal_copy_memory
_64
(
dst
,
src
,
count
*
sizeof
(
mal_int32
));
}
}
...
@@ -16791,7 +17087,7 @@ void mal_pcm_f32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
...
@@ -16791,7 +17087,7 @@ void mal_pcm_f32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
{
{
(
void
)
ditherMode
;
(
void
)
ditherMode
;
mal_copy_memory
(
dst
,
src
,
count
*
sizeof
(
float
));
mal_copy_memory
_64
(
dst
,
src
,
count
*
sizeof
(
float
));
}
}
...
@@ -17214,6 +17510,40 @@ mal_uint64 mal_format_converter_read_deinterleaved(mal_format_converter* pConver
...
@@ -17214,6 +17510,40 @@ mal_uint64 mal_format_converter_read_deinterleaved(mal_format_converter* pConver
//
//
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Splits a buffer into parts of equal length and of the given alignment. The returned size of the split buffers will be a
// multiple of the alignment. The alignment must be a power of 2. An alignment of 0 is treated as 1.
//
// pBuffer       - The buffer to split.
// bufferSize    - The size of pBuffer, in bytes.
// splitCount    - The number of parts to split the buffer into.
// alignment     - The alignment of the start of each part, in bytes.
// ppBuffersOut  - Optional. Receives splitCount pointers, one for the start of each part.
// pSplitSizeOut - Optional. Receives the size in bytes of each part.
//
// When pBuffer is NULL, or bufferSize or splitCount is 0, nothing is split. In that case *pSplitSizeOut is set to 0 so
// that the caller never reads an indeterminate size, and ppBuffersOut is left untouched.
//
// If the buffer is too small to reach the first aligned address the part size is 0.
void mal_split_buffer(void* pBuffer, size_t bufferSize, size_t splitCount, size_t alignment, void** ppBuffersOut, size_t* pSplitSizeOut)
{
    // Always give the output size a defined value, including on the early-out path below.
    if (pSplitSizeOut != NULL) {
        *pSplitSizeOut = 0;
    }

    if (pBuffer == NULL || bufferSize == 0 || splitCount == 0) {
        return;
    }

    if (alignment == 0) {
        alignment = 1;
    }

    // Round the start of the buffer up to the next multiple of the alignment.
    mal_uintptr pBufferUnaligned = (mal_uintptr)pBuffer;
    mal_uintptr pBufferAligned = (pBufferUnaligned + (alignment-1)) & ~(alignment-1);
    size_t unalignedBytes = (size_t)(pBufferAligned - pBufferUnaligned);

    // Divide what remains evenly between the parts, then round down to a multiple of the alignment so that every
    // part starts on an aligned address.
    size_t splitSize = 0;
    if (bufferSize >= unalignedBytes) {
        splitSize = (bufferSize - unalignedBytes) / splitCount;
        splitSize = splitSize & ~(alignment-1);
    }

    if (ppBuffersOut != NULL) {
        for (size_t i = 0; i < splitCount; ++i) {
            ppBuffersOut[i] = (mal_uint8*)(pBufferAligned + (splitSize*i));
        }
    }

    if (pSplitSizeOut != NULL) {
        *pSplitSizeOut = splitSize;
    }
}
// -X = Left, +X = Right
// -X = Left, +X = Right
// -Y = Bottom, +Y = Top
// -Y = Bottom, +Y = Top
// -Z = Front, +Z = Back
// -Z = Front, +Z = Back
...
@@ -17456,6 +17786,12 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
...
@@ -17456,6 +17786,12 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
pRouter
->
config
=
*
pConfig
;
pRouter
->
config
=
*
pConfig
;
// SIMD
pRouter
->
useSSE2
=
mal_has_sse2
()
&&
!
pConfig
->
noSSE2
;
pRouter
->
useAVX
=
mal_has_avx
()
&&
!
pConfig
->
noAVX
;
pRouter
->
useAVX512
=
mal_has_avx512f
()
&&
!
pConfig
->
noAVX512
;
pRouter
->
useNEON
=
mal_has_neon
()
&&
!
pConfig
->
noNEON
;
// If the input and output channels and channel maps are the same we should use a passthrough.
// If the input and output channels and channel maps are the same we should use a passthrough.
if
(
pRouter
->
config
.
channelsIn
==
pRouter
->
config
.
channelsOut
)
{
if
(
pRouter
->
config
.
channelsIn
==
pRouter
->
config
.
channelsOut
)
{
if
(
mal_channel_map_equal
(
pRouter
->
config
.
channelsIn
,
pRouter
->
config
.
channelMapIn
,
pRouter
->
config
.
channelMapOut
))
{
if
(
mal_channel_map_equal
(
pRouter
->
config
.
channelsIn
,
pRouter
->
config
.
channelMapIn
,
pRouter
->
config
.
channelMapOut
))
{
...
@@ -17631,6 +17967,26 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
...
@@ -17631,6 +17967,26 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
return
MAL_SUCCESS
;
return
MAL_SUCCESS
;
}
}
// Returns non-zero when the router has SSE2 enabled and both sample pointers are aligned to 16 bytes.
static MAL_INLINE mal_bool32 mal_channel_router__can_use_sse2(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
    // OR-ing the two addresses lets a single mask test cover both pointers.
    mal_uintptr combinedAddress = (mal_uintptr)pSamplesOut | (mal_uintptr)pSamplesIn;

    return pRouter->useSSE2 && ((combinedAddress & 15) == 0);
}
// Returns non-zero when the router has AVX enabled and both sample pointers are aligned to 32 bytes.
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
    // OR-ing the two addresses lets a single mask test cover both pointers.
    mal_uintptr combinedAddress = (mal_uintptr)pSamplesOut | (mal_uintptr)pSamplesIn;

    return pRouter->useAVX && ((combinedAddress & 31) == 0);
}
// Returns non-zero when the router has AVX-512 enabled and both sample pointers are aligned to 64 bytes.
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx512(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
    // OR-ing the two addresses lets a single mask test cover both pointers.
    mal_uintptr combinedAddress = (mal_uintptr)pSamplesOut | (mal_uintptr)pSamplesIn;

    return pRouter->useAVX512 && ((combinedAddress & 63) == 0);
}
// Returns non-zero when the router has NEON enabled and both sample pointers are aligned to 16 bytes.
static MAL_INLINE mal_bool32 mal_channel_router__can_use_neon(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
    // OR-ing the two addresses lets a single mask test cover both pointers.
    mal_uintptr combinedAddress = (mal_uintptr)pSamplesOut | (mal_uintptr)pSamplesIn;

    return pRouter->useNEON && ((combinedAddress & 15) == 0);
}
void
mal_channel_router__do_routing
(
mal_channel_router
*
pRouter
,
mal_uint64
frameCount
,
float
**
ppSamplesOut
,
const
float
**
ppSamplesIn
)
void
mal_channel_router__do_routing
(
mal_channel_router
*
pRouter
,
mal_uint64
frameCount
,
float
**
ppSamplesOut
,
const
float
**
ppSamplesIn
)
{
{
mal_assert
(
pRouter
!=
NULL
);
mal_assert
(
pRouter
!=
NULL
);
...
@@ -17641,20 +17997,83 @@ void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 fram
...
@@ -17641,20 +17997,83 @@ void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 fram
mal_assert
(
pRouter
->
config
.
channelsIn
==
pRouter
->
config
.
channelsOut
);
mal_assert
(
pRouter
->
config
.
channelsIn
==
pRouter
->
config
.
channelsOut
);
for
(
mal_uint32
iChannelIn
=
0
;
iChannelIn
<
pRouter
->
config
.
channelsIn
;
++
iChannelIn
)
{
for
(
mal_uint32
iChannelIn
=
0
;
iChannelIn
<
pRouter
->
config
.
channelsIn
;
++
iChannelIn
)
{
mal_uint32
iChannelOut
=
pRouter
->
shuffleTable
[
iChannelIn
];
mal_uint32
iChannelOut
=
pRouter
->
shuffleTable
[
iChannelIn
];
mal_copy_memory
(
ppSamplesOut
[
iChannelOut
],
ppSamplesIn
[
iChannelIn
],
frameCount
*
sizeof
(
float
));
mal_copy_memory
_64
(
ppSamplesOut
[
iChannelOut
],
ppSamplesIn
[
iChannelIn
],
frameCount
*
sizeof
(
float
));
}
}
}
else
{
}
else
{
// This is the more complicated case. Each of the output channels is accumulated with 0 or more input channels.
// This is the more complicated case. Each of the output channels is accumulated with 0 or more input channels.
// Clear.
// Clear.
for
(
mal_uint32
iChannelOut
=
0
;
iChannelOut
<
pRouter
->
config
.
channelsOut
;
++
iChannelOut
)
{
for
(
mal_uint32
iChannelOut
=
0
;
iChannelOut
<
pRouter
->
config
.
channelsOut
;
++
iChannelOut
)
{
mal_zero_memory
(
ppSamplesOut
[
iChannelOut
],
frameCount
*
sizeof
(
float
));
mal_zero_memory
_64
(
ppSamplesOut
[
iChannelOut
],
frameCount
*
sizeof
(
float
));
}
}
// Accumulate.
// Accumulate.
for
(
mal_uint32
iChannelIn
=
0
;
iChannelIn
<
pRouter
->
config
.
channelsIn
;
++
iChannelIn
)
{
for
(
mal_uint32
iChannelIn
=
0
;
iChannelIn
<
pRouter
->
config
.
channelsIn
;
++
iChannelIn
)
{
for
(
mal_uint32
iChannelOut
=
0
;
iChannelOut
<
pRouter
->
config
.
channelsOut
;
++
iChannelOut
)
{
for
(
mal_uint32
iChannelOut
=
0
;
iChannelOut
<
pRouter
->
config
.
channelsOut
;
++
iChannelOut
)
{
for
(
mal_uint64
iFrame
=
0
;
iFrame
<
frameCount
;
++
iFrame
)
{
mal_uint64
iFrame
=
0
;
#if defined(MAL_SUPPORT_AVX512)
if
(
mal_channel_router__can_use_avx512
(
pRouter
,
ppSamplesOut
[
iChannelOut
],
ppSamplesIn
[
iChannelIn
]))
{
__m512
weight
=
_mm512_set1_ps
(
pRouter
->
weights
[
iChannelIn
][
iChannelOut
]);
mal_uint64
frameCount16
=
frameCount
/
16
;
for
(
mal_uint64
iFrame16
=
0
;
iFrame16
<
frameCount16
;
iFrame16
+=
1
)
{
__m512
*
pO
=
(
__m512
*
)
ppSamplesOut
[
iChannelOut
]
+
iFrame16
;
__m512
*
pI
=
(
__m512
*
)
ppSamplesIn
[
iChannelIn
]
+
iFrame16
;
*
pO
=
_mm512_add_ps
(
*
pO
,
_mm512_mul_ps
(
*
pI
,
weight
));
}
iFrame
+=
frameCount16
*
16
;
}
else
#endif
#if defined(MAL_SUPPORT_AVX)
if
(
mal_channel_router__can_use_avx
(
pRouter
,
ppSamplesOut
[
iChannelOut
],
ppSamplesIn
[
iChannelIn
]))
{
__m256
weight
=
_mm256_set1_ps
(
pRouter
->
weights
[
iChannelIn
][
iChannelOut
]);
mal_uint64
frameCount8
=
frameCount
/
8
;
for
(
mal_uint64
iFrame8
=
0
;
iFrame8
<
frameCount8
;
iFrame8
+=
1
)
{
__m256
*
pO
=
(
__m256
*
)
ppSamplesOut
[
iChannelOut
]
+
iFrame8
;
__m256
*
pI
=
(
__m256
*
)
ppSamplesIn
[
iChannelIn
]
+
iFrame8
;
*
pO
=
_mm256_add_ps
(
*
pO
,
_mm256_mul_ps
(
*
pI
,
weight
));
}
iFrame
+=
frameCount8
*
8
;
}
else
#endif
#if defined(MAL_SUPPORT_SSE2)
if
(
mal_channel_router__can_use_sse2
(
pRouter
,
ppSamplesOut
[
iChannelOut
],
ppSamplesIn
[
iChannelIn
]))
{
__m128
weight
=
_mm_set1_ps
(
pRouter
->
weights
[
iChannelIn
][
iChannelOut
]);
mal_uint64
frameCount4
=
frameCount
/
4
;
for
(
mal_uint64
iFrame4
=
0
;
iFrame4
<
frameCount4
;
iFrame4
+=
1
)
{
__m128
*
pO
=
(
__m128
*
)
ppSamplesOut
[
iChannelOut
]
+
iFrame4
;
__m128
*
pI
=
(
__m128
*
)
ppSamplesIn
[
iChannelIn
]
+
iFrame4
;
*
pO
=
_mm_add_ps
(
*
pO
,
_mm_mul_ps
(
*
pI
,
weight
));
}
iFrame
+=
frameCount4
*
4
;
}
else
#endif
{
// Reference.
float
weight0
=
pRouter
->
weights
[
iChannelIn
][
iChannelOut
];
float
weight1
=
pRouter
->
weights
[
iChannelIn
][
iChannelOut
];
float
weight2
=
pRouter
->
weights
[
iChannelIn
][
iChannelOut
];
float
weight3
=
pRouter
->
weights
[
iChannelIn
][
iChannelOut
];
mal_uint64
frameCount4
=
frameCount
/
4
;
for
(
mal_uint64
iFrame4
=
0
;
iFrame4
<
frameCount4
;
iFrame4
+=
1
)
{
ppSamplesOut
[
iChannelOut
][
iFrame
+
0
]
+=
ppSamplesIn
[
iChannelIn
][
iFrame
+
0
]
*
weight0
;
ppSamplesOut
[
iChannelOut
][
iFrame
+
1
]
+=
ppSamplesIn
[
iChannelIn
][
iFrame
+
1
]
*
weight1
;
ppSamplesOut
[
iChannelOut
][
iFrame
+
2
]
+=
ppSamplesIn
[
iChannelIn
][
iFrame
+
2
]
*
weight2
;
ppSamplesOut
[
iChannelOut
][
iFrame
+
3
]
+=
ppSamplesIn
[
iChannelIn
][
iFrame
+
3
]
*
weight3
;
iFrame
+=
4
;
}
}
// Leftover.
for
(;
iFrame
<
frameCount
;
++
iFrame
)
{
ppSamplesOut
[
iChannelOut
][
iFrame
]
+=
ppSamplesIn
[
iChannelIn
][
iFrame
]
*
pRouter
->
weights
[
iChannelIn
][
iChannelOut
];
ppSamplesOut
[
iChannelOut
][
iFrame
]
+=
ppSamplesIn
[
iChannelIn
][
iFrame
]
*
pRouter
->
weights
[
iChannelIn
][
iChannelOut
];
}
}
}
}
...
@@ -17701,15 +18120,14 @@ mal_uint64 mal_channel_router_read_deinterleaved(mal_channel_router* pRouter, ma
...
@@ -17701,15 +18120,14 @@ mal_uint64 mal_channel_router_read_deinterleaved(mal_channel_router* pRouter, ma
float
*
ppNextSamplesOut
[
MAL_MAX_CHANNELS
];
float
*
ppNextSamplesOut
[
MAL_MAX_CHANNELS
];
mal_copy_memory
(
ppNextSamplesOut
,
ppSamplesOut
,
sizeof
(
float
*
)
*
pRouter
->
config
.
channelsOut
);
mal_copy_memory
(
ppNextSamplesOut
,
ppSamplesOut
,
sizeof
(
float
*
)
*
pRouter
->
config
.
channelsOut
);
float
temp
[
MAL_MAX_CHANNELS
*
256
];
MAL_ALIGN
(
MAL_SIMD_ALIGNMENT
)
float
temp
[
MAL_MAX_CHANNELS
*
256
];
mal_assert
(
sizeof
(
temp
)
<=
0xFFFFFFFF
);
mal_assert
(
sizeof
(
temp
)
<=
0xFFFFFFFF
);
mal_uint32
maxFramesToReadEachIteration
=
mal_countof
(
temp
)
/
pRouter
->
config
.
channelsIn
;
float
*
ppTemp
[
MAL_MAX_CHANNELS
];
float
*
ppTemp
[
MAL_MAX_CHANNELS
];
for
(
mal_uint32
iChannel
=
0
;
iChannel
<
pRouter
->
config
.
channelsIn
;
iChannel
+=
1
)
{
size_t
maxBytesToReadPerFrameEachIteration
;
ppTemp
[
iChannel
]
=
temp
+
(
maxFramesToReadEachIteration
*
iChannel
);
mal_split_buffer
(
temp
,
sizeof
(
temp
),
pRouter
->
config
.
channelsIn
,
MAL_SIMD_ALIGNMENT
,
(
void
**
)
&
ppTemp
,
&
maxBytesToReadPerFrameEachIteration
);
}
size_t
maxFramesToReadEachIteration
=
maxBytesToReadPerFrameEachIteration
/
sizeof
(
float
);
mal_uint64
totalFramesRead
=
0
;
mal_uint64
totalFramesRead
=
0
;
while
(
totalFramesRead
<
frameCount
)
{
while
(
totalFramesRead
<
frameCount
)
{
...
@@ -18073,7 +18491,7 @@ mal_uint64 mal_src_read_deinterleaved__linear(mal_src* pSRC, mal_uint64 frameCou
...
@@ -18073,7 +18491,7 @@ mal_uint64 mal_src_read_deinterleaved__linear(mal_src* pSRC, mal_uint64 frameCou
void
mal_pcm_convert
(
void
*
pOut
,
mal_format
formatOut
,
const
void
*
pIn
,
mal_format
formatIn
,
mal_uint64
sampleCount
,
mal_dither_mode
ditherMode
)
void
mal_pcm_convert
(
void
*
pOut
,
mal_format
formatOut
,
const
void
*
pIn
,
mal_format
formatIn
,
mal_uint64
sampleCount
,
mal_dither_mode
ditherMode
)
{
{
if
(
formatOut
==
formatIn
)
{
if
(
formatOut
==
formatIn
)
{
mal_copy_memory
(
pOut
,
pIn
,
sampleCount
*
mal_get_bytes_per_sample
(
formatOut
));
mal_copy_memory
_64
(
pOut
,
pIn
,
sampleCount
*
mal_get_bytes_per_sample
(
formatOut
));
return
;
return
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment