Commit ef2ad300 authored by David Reid's avatar David Reid

Add AVX optimized f32 -> s16 conversion.

parent b2815ccf
......@@ -17288,6 +17288,24 @@ void mal_pcm_u8_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_u8_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_u8_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_u8_to_s16__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_u8_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17328,18 +17346,32 @@ void mal_pcm_u8_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_u8_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_u8_to_s24__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_u8_to_s24__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_u8_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_u8_to_s24__reference(dst, src, count, ditherMode);
#else
#if defined(MAL_SUPPORT_SSE2)
mal_pcm_u8_to_s24__sse2(dst, src, count, ditherMode);
#else
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17370,6 +17402,24 @@ void mal_pcm_u8_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_u8_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_u8_to_s32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_u8_to_s32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_u8_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17409,6 +17459,24 @@ void mal_pcm_u8_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_u8_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_SSE2)
void mal_pcm_u8_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_u8_to_f32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_u8_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17543,6 +17611,24 @@ void mal_pcm_s16_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s16_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s16_to_u8__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s16_to_u8__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17587,6 +17673,24 @@ void mal_pcm_s16_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s16_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s16_to_s24__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s16_to_s24__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s16_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17622,6 +17726,24 @@ void mal_pcm_s16_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s16_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s16_to_s32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s16_to_s32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s16_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17669,6 +17791,24 @@ void mal_pcm_s16_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s16_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s16_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s16_to_f32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s16_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17781,6 +17921,24 @@ void mal_pcm_s24_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s24_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s24_to_u8__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s24_to_u8__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s24_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17834,6 +17992,24 @@ void mal_pcm_s24_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s24_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s24_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s24_to_s16__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s24_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17877,6 +18053,24 @@ void mal_pcm_s24_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s24_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s24_to_s32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s24_to_s32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s24_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -17924,6 +18118,24 @@ void mal_pcm_s24_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s24_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s24_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s24_to_f32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s24_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -18043,6 +18255,24 @@ void mal_pcm_s32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s32_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s32_to_u8__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s32_to_u8__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -18096,6 +18326,24 @@ void mal_pcm_s32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s32_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s32_to_s16__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s32_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -18134,6 +18382,24 @@ void mal_pcm_s32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s32_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s32_to_s24__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s32_to_s24__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s32_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -18187,6 +18453,24 @@ void mal_pcm_s32_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s32_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_s32_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_s32_to_f32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_s32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -18292,6 +18576,24 @@ void mal_pcm_f32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_f32_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_f32_to_u8__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_f32_to_u8__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_f32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -18399,7 +18701,6 @@ void mal_pcm_f32_to_s16__optimized(void* dst, const void* src, mal_uint64 count,
#if defined(MAL_SUPPORT_SSE2)
void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
#if 1
mal_int16* dst_s16 = (mal_int16*)dst;
const float* src_f32 = (const float*)src;
......@@ -18457,7 +18758,7 @@ void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
x0 = _mm_mul_ps(x0, _mm_set1_ps(32767.0f));
x1 = _mm_mul_ps(x1, _mm_set1_ps(32767.0f));
*((__m128i*)(dst_s16 + i)) = _mm_packs_epi32(_mm_cvtps_epi32(x0), _mm_cvtps_epi32(x1));
*((__m128i*)(dst_s16 + i)) = _mm_packs_epi32(_mm_cvttps_epi32(x0), _mm_cvttps_epi32(x1));
i += 8;
}
......@@ -18472,9 +18773,118 @@ void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
dst_s16[i] = (mal_int16)x;
}
#else
mal_pcm_f32_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_int16* dst_s16 = (mal_int16*)dst;
const float* src_f32 = (const float*)src;
float ditherMin = 0;
float ditherMax = 0;
if (ditherMode != mal_dither_mode_none) {
ditherMin = 1.0f / -32768;
ditherMax = 1.0f / 32767;
}
mal_uint64 i = 0;
// AVX. AVX allows us to output 16 s16's at a time which means our loop is unrolled 16 times.
mal_uint64 count16 = count >> 4;
for (mal_uint64 i16 = 0; i16 < count16; i16 += 1) {
__m256 d0;
__m256 d1;
if (ditherMode == mal_dither_mode_none) {
d0 = _mm256_set1_ps(0);
d1 = _mm256_set1_ps(0);
} else if (ditherMode == mal_dither_mode_rectangle) {
d0 = _mm256_set_ps(
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax)
);
d1 = _mm256_set_ps(
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax),
mal_dither_f32_rectangle(ditherMin, ditherMax)
);
} else {
d0 = _mm256_set_ps(
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax)
);
d1 = _mm256_set_ps(
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax),
mal_dither_f32_triangle(ditherMin, ditherMax)
);
}
__m256 x0 = *((__m256*)(src_f32 + i) + 0);
__m256 x1 = *((__m256*)(src_f32 + i) + 1);
x0 = _mm256_add_ps(x0, d0);
x1 = _mm256_add_ps(x1, d1);
x0 = _mm256_mul_ps(x0, _mm256_set1_ps(32767.0f));
x1 = _mm256_mul_ps(x1, _mm256_set1_ps(32767.0f));
// Computing the final result is a little more complicated for AVX than SSE.
__m256i i0 = _mm256_cvttps_epi32(x0);
__m256i i1 = _mm256_cvttps_epi32(x1);
__m256i p0 = _mm256_permute2x128_si256(i0, i1, 32);
__m256i p1 = _mm256_permute2x128_si256(i0, i1, 49);
__m256i r = _mm256_packs_epi32(p0, p1);
*((__m256i*)(dst_s16 + i)) = r;
i += 16;
}
// Leftover.
for (; i < count; i += 1) {
float x = src_f32[i];
x = x + mal_dither_f32(ditherMode, ditherMin, ditherMax);
x = ((x < -1) ? -1 : ((x > 1) ? 1 : x)); // clip
x = x * 32767.0f; // -1..1 to -32767..32767
dst_s16[i] = (mal_int16)x;
}
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_f32_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
// TODO: Convert this from AVX to AVX-512.
mal_pcm_f32_to_s16__avx(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_f32_to_s16__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
......@@ -18528,6 +18938,24 @@ void mal_pcm_f32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_f32_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_f32_to_s24__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_f32_to_s24__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_f32_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......@@ -18576,6 +19004,24 @@ void mal_pcm_f32_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_f32_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_f32_to_s32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
void mal_pcm_f32_to_s32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
void mal_pcm_f32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
......
......@@ -269,21 +269,210 @@ void pcm_convert__sse2(void* pOut, mal_format formatOut, const void* pIn, mal_fo
#if defined(MAL_SUPPORT_AVX)
void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
pcm_convert__sse2(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
switch (formatIn)
{
case mal_format_u8:
{
switch (formatOut)
{
case mal_format_s16: mal_pcm_u8_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_u8_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_u8_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_u8_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_s16:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s16_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s16_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s16_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s16_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_s24:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s24_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s24_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s24_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s24_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_s32:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s32_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s32_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s32_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s32_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_f32:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_f32_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_f32_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_f32_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_f32_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
default: break;
}
}
#endif
#if defined(MAL_SUPPORT_AVX512)
void pcm_convert__avx512(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
switch (formatIn)
{
case mal_format_u8:
{
switch (formatOut)
{
case mal_format_s16: mal_pcm_u8_to_s16__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_u8_to_s24__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_u8_to_s32__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_u8_to_f32__avx512(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_s16:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s16_to_u8__avx512( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s16_to_s24__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s16_to_s32__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s16_to_f32__avx512(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_s24:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s24_to_u8__avx512( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s24_to_s16__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s24_to_s32__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s24_to_f32__avx512(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_s32:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s32_to_u8__avx512( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s32_to_s16__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s32_to_s24__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s32_to_f32__avx512(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_f32:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_f32_to_u8__avx512( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_f32_to_s16__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_f32_to_s24__avx512(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_f32_to_s32__avx512(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
default: break;
}
}
#endif
#if defined(MAL_SUPPORT_NEON)
void pcm_convert__neon(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
pcm_convert__reference(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
switch (formatIn)
{
case mal_format_u8:
{
switch (formatOut)
{
case mal_format_s16: mal_pcm_u8_to_s16__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_u8_to_s24__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_u8_to_s32__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_u8_to_f32__neon(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_s16:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s16_to_u8__neon( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s16_to_s24__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s16_to_s32__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s16_to_f32__neon(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_s24:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s24_to_u8__neon( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s24_to_s16__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s24_to_s32__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s24_to_f32__neon(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_s32:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s32_to_u8__neon( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s32_to_s16__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s32_to_s24__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s32_to_f32__neon(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
case mal_format_f32:
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_f32_to_u8__neon( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_f32_to_s16__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_f32_to_s24__neon(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_f32_to_s32__neon(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
default: break;
}
}
#endif
......@@ -359,7 +548,7 @@ int do_profiling__format_conversion__profile_individual(mal_format formatIn, mal
{
mal_int16 a = ((const mal_int16*)pReferenceData)[iSample];
mal_int16 b = ((const mal_int16*)pTestData)[iSample];
if (abs(a-b) > 1) {
if (abs(a-b) > 0) {
printf("Incorrect Sample: (%d) %d != %d\n", (int)iSample, a, b);
passed = MAL_FALSE;
}
......@@ -900,9 +1089,19 @@ int do_profiling__src()
// Converts two 4xf32 vectors to one 8xi16 vector with signed saturation.
static inline __m128i drmath_vf32_to_vi16__sse2(__m128 f32_0, __m128 f32_1)
__m128i drmath_vf32_to_vi16__sse2(__m128 f32_0, __m128 f32_1)
{
return _mm_packs_epi32(_mm_cvtps_epi32(f32_0), _mm_cvtps_epi32(f32_1));
return _mm_packs_epi32(_mm_cvttps_epi32(f32_0), _mm_cvttps_epi32(f32_1));
}
__m256i drmath_vf32_to_vi16__avx(__m256 f32_0, __m256 f32_1)
{
__m256i i0 = _mm256_cvttps_epi32(f32_0);
__m256i i1 = _mm256_cvttps_epi32(f32_1);
__m256i p0 = _mm256_permute2x128_si256(i0, i1, 32);
__m256i p1 = _mm256_permute2x128_si256(i0, i1, 49);
__m256i r = _mm256_packs_epi32(p0, p1);
return r;
}
int main(int argc, char** argv)
......@@ -916,7 +1115,11 @@ int main(int argc, char** argv)
//__m128 f1 = _mm_set_ps(-32780, 6, 5, 4);
//__m128i r = drmath_vf32_to_vi16__sse2(f0, f1);
//int a = 5;
__m256 f0 = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
__m256 f1 = _mm256_set_ps(15, 14, 13, 12, 11, 10, 9, 8);
__m256i r = drmath_vf32_to_vi16__avx(f0, f1);
int a = 5;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment