Commit b2815ccf authored by David Reid

Add SSE2 optimized implementation of f32 -> s16 conversion.

parent 073e89e4
......@@ -3306,16 +3306,25 @@ static MAL_INLINE mal_int32 mal_rand_range_s32(mal_int32 lo, mal_int32 hi)
}
// Rectangular dither: returns the result of a single mal_rand_range_f32(ditherMin, ditherMax) call.
static MAL_INLINE float mal_dither_f32_rectangle(float ditherMin, float ditherMax)
{
return mal_rand_range_f32(ditherMin, ditherMax);
}
// Triangular dither: returns the sum of two random values, one drawn with the range arguments (ditherMin, 0) and
// one drawn with (0, ditherMax).
static MAL_INLINE float mal_dither_f32_triangle(float ditherMin, float ditherMax)
{
// The two draws are kept as separate statements so that they are always made in this order. Folding them into a
// single expression would leave the call order up to the compiler.
float a = mal_rand_range_f32(ditherMin, 0);
float b = mal_rand_range_f32(0, ditherMax);
return a + b;
}
static MAL_INLINE float mal_dither_f32(mal_dither_mode ditherMode, float ditherMin, float ditherMax)
{
if (ditherMode == mal_dither_mode_rectangle) {
float a = mal_rand_range_f32(ditherMin, ditherMax);
return a;
return mal_dither_f32_rectangle(ditherMin, ditherMax);
}
if (ditherMode == mal_dither_mode_triangle) {
float a = mal_rand_range_f32(ditherMin, 0);
float b = mal_rand_range_f32(0, ditherMax);
return a + b;
return mal_dither_f32_triangle(ditherMin, ditherMax);
}
return 0;
......@@ -17273,8 +17282,8 @@ void mal_pcm_u8_to_s16__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_u8_to_s16__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_u8_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the u8 -> s16 conversion. Contains no SSE2-specific code; forwards to mal_pcm_u8_to_s16__optimized().
void mal_pcm_u8_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
}
......@@ -17284,13 +17293,9 @@ void mal_pcm_u8_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_u8_to_s16__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_u8_to_s16__sse(dst, src, count, ditherMode);
#else
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17317,8 +17322,8 @@ void mal_pcm_u8_to_s24__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_u8_to_s24__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_u8_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the u8 -> s24 conversion. Contains no SSE2-specific code; forwards to mal_pcm_u8_to_s24__optimized().
void mal_pcm_u8_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
}
......@@ -17329,8 +17334,8 @@ void mal_pcm_u8_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_u8_to_s24__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_u8_to_s24__sse(dst, src, count, ditherMode);
#if defined(MAL_SUPPORT_SSE2)
mal_pcm_u8_to_s24__sse2(dst, src, count, ditherMode);
#else
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
#endif
......@@ -17359,8 +17364,8 @@ void mal_pcm_u8_to_s32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_u8_to_s32__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_u8_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the u8 -> s32 conversion. Contains no SSE2-specific code; forwards to mal_pcm_u8_to_s32__optimized().
void mal_pcm_u8_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
}
......@@ -17370,13 +17375,9 @@ void mal_pcm_u8_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_u8_to_s32__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_u8_to_s32__sse(dst, src, count, ditherMode);
#else
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17402,8 +17403,8 @@ void mal_pcm_u8_to_f32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_u8_to_f32__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_u8_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the u8 -> f32 conversion. Contains no SSE2-specific code; forwards to mal_pcm_u8_to_f32__optimized().
void mal_pcm_u8_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
}
......@@ -17413,13 +17414,9 @@ void mal_pcm_u8_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_u8_to_f32__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_u8_to_f32__sse(dst, src, count, ditherMode);
#else
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17540,8 +17537,8 @@ void mal_pcm_s16_to_u8__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s16_to_u8__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s16_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s16 -> u8 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s16_to_u8__optimized().
void mal_pcm_s16_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
}
......@@ -17551,13 +17548,9 @@ void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s16_to_u8__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s16_to_u8__sse(dst, src, count, ditherMode);
#else
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17588,8 +17581,8 @@ void mal_pcm_s16_to_s24__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s16_to_s24__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s16_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s16 -> s24 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s16_to_s24__optimized().
void mal_pcm_s16_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
}
......@@ -17599,13 +17592,9 @@ void mal_pcm_s16_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s16_to_s24__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s16_to_s24__sse(dst, src, count, ditherMode);
#else
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17627,8 +17616,8 @@ void mal_pcm_s16_to_s32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s16_to_s32__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s16_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s16 -> s32 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s16_to_s32__optimized().
void mal_pcm_s16_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
}
......@@ -17638,13 +17627,9 @@ void mal_pcm_s16_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s16_to_s32__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s16_to_s32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17678,8 +17663,8 @@ void mal_pcm_s16_to_f32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s16_to_f32__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s16_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s16 -> f32 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s16_to_f32__optimized().
void mal_pcm_s16_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
}
......@@ -17689,13 +17674,9 @@ void mal_pcm_s16_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s16_to_f32__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s16_to_f32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17794,8 +17775,8 @@ void mal_pcm_s24_to_u8__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s24_to_u8__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s24_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s24 -> u8 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s24_to_u8__optimized().
void mal_pcm_s24_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
}
......@@ -17805,13 +17786,9 @@ void mal_pcm_s24_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s24_to_u8__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s24_to_u8__sse(dst, src, count, ditherMode);
#else
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17851,8 +17828,8 @@ void mal_pcm_s24_to_s16__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s24_to_s16__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s24_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s24 -> s16 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s24_to_s16__optimized().
void mal_pcm_s24_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
}
......@@ -17862,13 +17839,9 @@ void mal_pcm_s24_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s24_to_s16__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s24_to_s16__sse(dst, src, count, ditherMode);
#else
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17898,8 +17871,8 @@ void mal_pcm_s24_to_s32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s24_to_s32__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s24_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s24 -> s32 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s24_to_s32__optimized().
void mal_pcm_s24_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
}
......@@ -17909,13 +17882,9 @@ void mal_pcm_s24_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s24_to_s32__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s24_to_s32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -17949,8 +17918,8 @@ void mal_pcm_s24_to_f32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s24_to_f32__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s24_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s24 -> f32 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s24_to_f32__optimized().
void mal_pcm_s24_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
}
......@@ -17960,13 +17929,9 @@ void mal_pcm_s24_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s24_to_f32__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s24_to_f32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -18072,8 +18037,8 @@ void mal_pcm_s32_to_u8__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s32_to_u8__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s32_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s32 -> u8 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s32_to_u8__optimized().
void mal_pcm_s32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
}
......@@ -18083,13 +18048,9 @@ void mal_pcm_s32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s32_to_u8__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s32_to_u8__sse(dst, src, count, ditherMode);
#else
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -18129,8 +18090,8 @@ void mal_pcm_s32_to_s16__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s32_to_s16__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s32_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s32 -> s16 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s32_to_s16__optimized().
void mal_pcm_s32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
}
......@@ -18140,13 +18101,9 @@ void mal_pcm_s32_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s32_to_s16__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s32_to_s16__sse(dst, src, count, ditherMode);
#else
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -18171,8 +18128,8 @@ void mal_pcm_s32_to_s24__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s32_to_s24__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s32_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s32 -> s24 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s32_to_s24__optimized().
void mal_pcm_s32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
}
......@@ -18182,13 +18139,9 @@ void mal_pcm_s32_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s32_to_s24__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s32_to_s24__sse(dst, src, count, ditherMode);
#else
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -18228,8 +18181,8 @@ void mal_pcm_s32_to_f32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s32_to_f32__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_s32_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the s32 -> f32 conversion. Contains no SSE2-specific code; forwards to mal_pcm_s32_to_f32__optimized().
void mal_pcm_s32_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
}
......@@ -18239,13 +18192,9 @@ void mal_pcm_s32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s32_to_f32__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_s32_to_f32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -18337,8 +18286,8 @@ void mal_pcm_f32_to_u8__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_f32_to_u8__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_f32_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the f32 -> u8 conversion. Contains no SSE2-specific code; forwards to mal_pcm_f32_to_u8__optimized().
void mal_pcm_f32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
}
......@@ -18348,13 +18297,9 @@ void mal_pcm_f32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_f32_to_u8__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_f32_to_u8__sse(dst, src, count, ditherMode);
#else
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -18392,13 +18337,144 @@ void mal_pcm_f32_to_s16__reference(void* dst, const void* src, mal_uint64 count,
// Scalar f32 -> s16 conversion, unrolled to process four samples per iteration.
//
// dst        - output buffer receiving `count` 16-bit signed samples.
// src        - input buffer holding `count` 32-bit float samples.
// count      - number of samples to convert.
// ditherMode - dither applied to each sample before clipping. With mal_dither_mode_none the dither range is [0, 0].
//
// Each sample has dither added, is clipped to [-1, 1], scaled by 32767 and then truncated by the cast to mal_int16.
// The buffer is converted exactly once, so the dither random number generator is advanced once per sample.
void mal_pcm_f32_to_s16__optimized(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
    mal_int16* dst_s16 = (mal_int16*)dst;
    const float* src_f32 = (const float*)src;

    float ditherMin = 0;
    float ditherMax = 0;
    if (ditherMode != mal_dither_mode_none) {
        ditherMin = 1.0f / -32768;
        ditherMax = 1.0f /  32767;
    }

    mal_uint64 i = 0;

    // Unrolled. The four dither values are drawn in sample order before any of them is used.
    mal_uint64 count4 = count >> 2;
    for (mal_uint64 i4 = 0; i4 < count4; i4 += 1) {
        float d0 = mal_dither_f32(ditherMode, ditherMin, ditherMax);
        float d1 = mal_dither_f32(ditherMode, ditherMin, ditherMax);
        float d2 = mal_dither_f32(ditherMode, ditherMin, ditherMax);
        float d3 = mal_dither_f32(ditherMode, ditherMin, ditherMax);

        float x0 = src_f32[i+0] + d0;
        float x1 = src_f32[i+1] + d1;
        float x2 = src_f32[i+2] + d2;
        float x3 = src_f32[i+3] + d3;

        // Clip.
        x0 = ((x0 < -1) ? -1 : ((x0 > 1) ? 1 : x0));
        x1 = ((x1 < -1) ? -1 : ((x1 > 1) ? 1 : x1));
        x2 = ((x2 < -1) ? -1 : ((x2 > 1) ? 1 : x2));
        x3 = ((x3 < -1) ? -1 : ((x3 > 1) ? 1 : x3));

        // -1..1 to -32767..32767
        x0 = x0 * 32767.0f;
        x1 = x1 * 32767.0f;
        x2 = x2 * 32767.0f;
        x3 = x3 * 32767.0f;

        dst_s16[i+0] = (mal_int16)x0;
        dst_s16[i+1] = (mal_int16)x1;
        dst_s16[i+2] = (mal_int16)x2;
        dst_s16[i+3] = (mal_int16)x3;

        i += 4;
    }

    // Leftover: the final (count % 4) samples, converted one at a time with the same steps.
    for (; i < count; i += 1) {
        float x = src_f32[i];
        x = x + mal_dither_f32(ditherMode, ditherMin, ditherMax);
        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    // clip
        x = x * 32767.0f;                           // -1..1 to -32767..32767

        dst_s16[i] = (mal_int16)x;
    }
}
#ifdef MAL_USE_SSE
void mal_pcm_f32_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 f32 -> s16 conversion. Eight samples are converted per iteration; any remainder is handled by a scalar loop.
//
// dst        - output buffer receiving `count` 16-bit signed samples. No particular alignment is required.
// src        - input buffer holding `count` 32-bit float samples. No particular alignment is required.
// count      - number of samples to convert.
// ditherMode - dither applied to each sample before clipping. With mal_dither_mode_none no dither is added.
//
// Every sample, whether it goes through the vector loop or the scalar tail, is processed with the same steps:
// add dither, clip to [-1, 1], scale by 32767, truncate toward zero.
void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
    mal_int16* dst_s16 = (mal_int16*)dst;
    const float* src_f32 = (const float*)src;

    float ditherMin = 0;
    float ditherMax = 0;
    if (ditherMode != mal_dither_mode_none) {
        ditherMin = 1.0f / -32768;
        ditherMax = 1.0f /  32767;
    }

    const __m128 clipMin = _mm_set1_ps(-1.0f);
    const __m128 clipMax = _mm_set1_ps( 1.0f);
    const __m128 scale   = _mm_set1_ps(32767.0f);

    mal_uint64 i = 0;

    // SSE2. SSE allows us to output 8 s16's at a time which means our loop is unrolled 8 times.
    mal_uint64 count8 = count >> 3;
    for (mal_uint64 i8 = 0; i8 < count8; i8 += 1) {
        // Dither values are written to an array in sample order and then loaded. Passing the generator calls
        // directly as arguments of _mm_set_ps() would leave their evaluation order unspecified, making the result
        // for a given seed compiler dependent.
        float d[8];
        int k;
        if (ditherMode == mal_dither_mode_none) {
            for (k = 0; k < 8; k += 1) {
                d[k] = 0;
            }
        } else if (ditherMode == mal_dither_mode_rectangle) {
            for (k = 0; k < 8; k += 1) {
                d[k] = mal_dither_f32_rectangle(ditherMin, ditherMax);
            }
        } else {
            for (k = 0; k < 8; k += 1) {
                d[k] = mal_dither_f32_triangle(ditherMin, ditherMax);
            }
        }

        // Unaligned loads: the caller's buffer is not guaranteed to be 16-byte aligned.
        __m128 x0 = _mm_loadu_ps(src_f32 + i + 0);
        __m128 x1 = _mm_loadu_ps(src_f32 + i + 4);

        x0 = _mm_add_ps(x0, _mm_loadu_ps(d + 0));
        x1 = _mm_add_ps(x1, _mm_loadu_ps(d + 4));

        // Clip before scaling. Without this a very large input overflows the float -> int32 conversion, which
        // yields 0x80000000 and would then be packed as -32768 regardless of the sign of the input.
        x0 = _mm_min_ps(_mm_max_ps(x0, clipMin), clipMax);
        x1 = _mm_min_ps(_mm_max_ps(x1, clipMin), clipMax);

        // -1..1 to -32767..32767
        x0 = _mm_mul_ps(x0, scale);
        x1 = _mm_mul_ps(x1, scale);

        // Truncating conversion, matching the (mal_int16) cast used by the scalar tail below.
        _mm_storeu_si128((__m128i*)(dst_s16 + i), _mm_packs_epi32(_mm_cvttps_epi32(x0), _mm_cvttps_epi32(x1)));

        i += 8;
    }

    // Leftover: the final (count % 8) samples.
    for (; i < count; i += 1) {
        float x = src_f32[i];
        x = x + mal_dither_f32(ditherMode, ditherMin, ditherMax);
        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    // clip
        x = x * 32767.0f;                           // -1..1 to -32767..32767

        dst_s16[i] = (mal_int16)x;
    }
}
#endif
......@@ -18406,13 +18482,9 @@ void mal_pcm_f32_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_f32_to_s16__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_f32_to_s16__sse(dst, src, count, ditherMode);
#else
mal_pcm_f32_to_s16__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -18450,8 +18522,8 @@ void mal_pcm_f32_to_s24__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_f32_to_s24__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_f32_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the f32 -> s24 conversion. Contains no SSE2-specific code; forwards to mal_pcm_f32_to_s24__optimized().
void mal_pcm_f32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
}
......@@ -18461,13 +18533,9 @@ void mal_pcm_f32_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_f32_to_s24__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_f32_to_s24__sse(dst, src, count, ditherMode);
#else
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......@@ -18502,8 +18570,8 @@ void mal_pcm_f32_to_s32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_f32_to_s32__reference(dst, src, count, ditherMode);
}
#ifdef MAL_USE_SSE
void mal_pcm_f32_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_SSE2)
// SSE2 variant of the f32 -> s32 conversion. Contains no SSE2-specific code; forwards to mal_pcm_f32_to_s32__optimized().
void mal_pcm_f32_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
}
......@@ -18513,13 +18581,9 @@ void mal_pcm_f32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_f32_to_s32__reference(dst, src, count, ditherMode);
#else
#ifdef MAL_USE_SSE
mal_pcm_f32_to_s32__sse(dst, src, count, ditherMode);
#else
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
#endif
#endif
}
......
......@@ -34,6 +34,429 @@ const char* mal_src_algorithm_to_string(mal_src_algorithm algorithm)
return "Unknown";
}
// Returns a human-readable name for `ditherMode`, or "Unknown" if the value is not one of the recognized modes.
// The returned pointer refers to a string literal and must not be freed.
const char* mal_dither_mode_to_string(mal_dither_mode ditherMode)
{
    switch (ditherMode) {
        case mal_dither_mode_none:      return "None";
        case mal_dither_mode_rectangle: return "Rectangle";
        case mal_dither_mode_triangle:  return "Triangle";
        default: break;
    }

    return "Unknown";
}
///////////////////////////////////////////////////////////////////////////////
//
// Format Conversion
//
///////////////////////////////////////////////////////////////////////////////
// State for a buffer of samples used by the format conversion tests.
// NOTE(review): nothing in the visible code reads or writes these fields; the per-field notes are inferred from the
// names only and should be confirmed against the code that uses this struct.
typedef struct
{
void* pBaseData;            // presumably the start of the sample buffer - TODO confirm
mal_uint64 sampleCount;     // presumably the number of samples in pBaseData - TODO confirm
mal_uint64 iNextSample;     // presumably the index of the next sample to consume - TODO confirm
} format_conversion_data;
// Converts `sampleCount` samples from `pIn` (in `formatIn`) to `pOut` (in `formatOut`) using the __reference
// conversion routines. Nothing is written when the format pair has no matching routine, which includes the case
// where both formats are the same.
void pcm_convert__reference(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
    if (formatIn == mal_format_u8) {
        if (formatOut == mal_format_s16) {
            mal_pcm_u8_to_s16__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_u8_to_s24__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_u8_to_s32__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_u8_to_f32__reference(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_s16) {
        if (formatOut == mal_format_u8) {
            mal_pcm_s16_to_u8__reference( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_s16_to_s24__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_s16_to_s32__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_s16_to_f32__reference(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_s24) {
        if (formatOut == mal_format_u8) {
            mal_pcm_s24_to_u8__reference( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s16) {
            mal_pcm_s24_to_s16__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_s24_to_s32__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_s24_to_f32__reference(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_s32) {
        if (formatOut == mal_format_u8) {
            mal_pcm_s32_to_u8__reference( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s16) {
            mal_pcm_s32_to_s16__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_s32_to_s24__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_s32_to_f32__reference(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_f32) {
        if (formatOut == mal_format_u8) {
            mal_pcm_f32_to_u8__reference( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s16) {
            mal_pcm_f32_to_s16__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_f32_to_s24__reference(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_f32_to_s32__reference(pOut, pIn, sampleCount, ditherMode);
        }
    }
}
// Converts `sampleCount` samples from `pIn` (in `formatIn`) to `pOut` (in `formatOut`) using the __optimized
// conversion routines. Nothing is written when the format pair has no matching routine, which includes the case
// where both formats are the same.
void pcm_convert__optimized(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
    if (formatIn == mal_format_u8) {
        if (formatOut == mal_format_s16) {
            mal_pcm_u8_to_s16__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_u8_to_s24__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_u8_to_s32__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_u8_to_f32__optimized(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_s16) {
        if (formatOut == mal_format_u8) {
            mal_pcm_s16_to_u8__optimized( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_s16_to_s24__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_s16_to_s32__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_s16_to_f32__optimized(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_s24) {
        if (formatOut == mal_format_u8) {
            mal_pcm_s24_to_u8__optimized( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s16) {
            mal_pcm_s24_to_s16__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_s24_to_s32__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_s24_to_f32__optimized(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_s32) {
        if (formatOut == mal_format_u8) {
            mal_pcm_s32_to_u8__optimized( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s16) {
            mal_pcm_s32_to_s16__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_s32_to_s24__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_s32_to_f32__optimized(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_f32) {
        if (formatOut == mal_format_u8) {
            mal_pcm_f32_to_u8__optimized( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s16) {
            mal_pcm_f32_to_s16__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_f32_to_s24__optimized(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_f32_to_s32__optimized(pOut, pIn, sampleCount, ditherMode);
        }
    }
}
#if defined(MAL_SUPPORT_SSE2)
// Converts `sampleCount` samples from `pIn` (in `formatIn`) to `pOut` (in `formatOut`) using the __sse2 conversion
// routines. Nothing is written when the format pair has no matching routine, which includes the case where both
// formats are the same. Only compiled when MAL_SUPPORT_SSE2 is defined.
void pcm_convert__sse2(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
    if (formatIn == mal_format_u8) {
        if (formatOut == mal_format_s16) {
            mal_pcm_u8_to_s16__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_u8_to_s24__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_u8_to_s32__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_u8_to_f32__sse2(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_s16) {
        if (formatOut == mal_format_u8) {
            mal_pcm_s16_to_u8__sse2( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_s16_to_s24__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_s16_to_s32__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_s16_to_f32__sse2(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_s24) {
        if (formatOut == mal_format_u8) {
            mal_pcm_s24_to_u8__sse2( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s16) {
            mal_pcm_s24_to_s16__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_s24_to_s32__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_s24_to_f32__sse2(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_s32) {
        if (formatOut == mal_format_u8) {
            mal_pcm_s32_to_u8__sse2( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s16) {
            mal_pcm_s32_to_s16__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_s32_to_s24__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_f32) {
            mal_pcm_s32_to_f32__sse2(pOut, pIn, sampleCount, ditherMode);
        }
    } else if (formatIn == mal_format_f32) {
        if (formatOut == mal_format_u8) {
            mal_pcm_f32_to_u8__sse2( pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s16) {
            mal_pcm_f32_to_s16__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s24) {
            mal_pcm_f32_to_s24__sse2(pOut, pIn, sampleCount, ditherMode);
        } else if (formatOut == mal_format_s32) {
            mal_pcm_f32_to_s32__sse2(pOut, pIn, sampleCount, ditherMode);
        }
    }
}
#endif
#if defined(MAL_SUPPORT_AVX)
// AVX variant of the format conversion dispatcher. Contains no AVX-specific code; forwards to pcm_convert__sse2().
// NOTE(review): pcm_convert__sse2() is only compiled under MAL_SUPPORT_SSE2, so this assumes MAL_SUPPORT_AVX is
// never defined without it - confirm against where these macros are set.
void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
pcm_convert__sse2(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX512)
// AVX-512 variant of the format conversion dispatcher. Contains no AVX-512-specific code; forwards to
// pcm_convert__avx().
// NOTE(review): pcm_convert__avx() is only compiled under MAL_SUPPORT_AVX, so this assumes MAL_SUPPORT_AVX512 is
// never defined without it - confirm against where these macros are set.
void pcm_convert__avx512(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
// NEON variant of the format conversion dispatcher. Contains no NEON-specific code; forwards to
// pcm_convert__reference().
void pcm_convert__neon(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
pcm_convert__reference(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
}
#endif
// Runs a format conversion with the implementation selected by `mode`. The dither seed is reset on every call.
// If `mode` does not match any implementation compiled into this build, no conversion is performed and `pOut` is
// left untouched.
void pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode, simd_mode mode)
{
    // For testing, we always reset the seed for dithering so we can get consistent results for comparisons.
    mal_seed(1234);

    if (mode == simd_mode_scalar) {
        pcm_convert__optimized(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
        return;
    }

#if defined(MAL_SUPPORT_SSE2)
    if (mode == simd_mode_sse2) {
        pcm_convert__sse2(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
        return;
    }
#endif

#if defined(MAL_SUPPORT_AVX)
    if (mode == simd_mode_avx) {
        pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
        return;
    }
#endif

#if defined(MAL_SUPPORT_AVX512)
    if (mode == simd_mode_avx512) {
        pcm_convert__avx512(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
        return;
    }
#endif

#if defined(MAL_SUPPORT_NEON)
    if (mode == simd_mode_neon) {
        pcm_convert__neon(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
        return;
    }
#endif
}
// Profiles a single conversion (formatIn -> formatOut) using the implementation selected by `mode`, checks
// the output against pReferenceData and prints a one-line report.
//
// pBaseData       - input samples in formatIn.
// sampleCount     - number of samples in pBaseData / pReferenceData.
// pReferenceData  - expected output in formatOut, used for the correctness check.
// referenceTime   - time taken by the reference conversion, used for the relative percentage in the report.
//
// Returns 0 once the report has been printed (even if the comparison failed), or -1 if the scratch output
// buffer could not be allocated.
int do_profiling__format_conversion__profile_individual(mal_format formatIn, mal_format formatOut, mal_dither_mode ditherMode, const void* pBaseData, mal_uint64 sampleCount, simd_mode mode, const void* pReferenceData, double referenceTime)
{
    void* pTestData = mal_aligned_malloc((size_t)(sampleCount * mal_get_bytes_per_sample(formatOut)), MAL_SIMD_ALIGNMENT);
    if (pTestData == NULL) {
        printf("Out of memory.\n");
        return -1;
    }

    // Time just the conversion call.
    mal_timer timer;
    mal_timer_init(&timer);

    double startTime = mal_timer_get_time_in_seconds(&timer);
    pcm_convert(pTestData, formatOut, pBaseData, formatIn, sampleCount, ditherMode, mode);
    double timeTaken = mal_timer_get_time_in_seconds(&timer) - startTime;

    // Verify against the reference output. The tolerance depends on the output format: s16 may differ by at
    // most 1, everything else must match byte for byte.
    mal_bool32 passed = MAL_TRUE;
    for (mal_uint64 iSample = 0; iSample < sampleCount; iSample += 1) {
        mal_uint32 bps = mal_get_bytes_per_sample(formatOut);

        if (formatOut == mal_format_s16) {
            mal_int16 expected = ((const mal_int16*)pReferenceData)[iSample];
            mal_int16 actual   = ((const mal_int16*)pTestData)[iSample];
            if (abs(expected-actual) > 1) {
                printf("Incorrect Sample: (%d) %d != %d\n", (int)iSample, expected, actual);
                passed = MAL_FALSE;
            }
        } else {
            const void* pExpected = mal_offset_ptr(pReferenceData, iSample*bps);
            const void* pActual   = mal_offset_ptr(pTestData, iSample*bps);
            if (memcmp(pExpected, pActual, bps) != 0) {
                printf("Incorrect Sample: (%d)\n", (int)iSample);
                passed = MAL_FALSE;
            }
        }
    }

    if (!passed) {
        printf(" [FAILED] ");
    } else {
        printf(" [PASSED] ");
    }

    printf("(Dither = %s) %s -> %s (%s): %.4fms (%.2f%%)\n", mal_dither_mode_to_string(ditherMode), mal_get_format_name(formatIn), mal_get_format_name(formatOut), simd_mode_to_string(mode), timeTaken*1000, referenceTime/timeTaken*100);

    mal_aligned_free(pTestData);
    return 0;
}
// Profiles every available implementation of the formatIn -> formatOut conversion for the given dither mode.
//
// The input is produced by generating an f32 sine wave and converting it to formatIn. The reference
// implementation is run (and timed) first; its output and timing are then used as the baseline for the
// scalar path and for each SIMD path the CPU reports support for.
//
// Returns 0 on success, or -1 if any of the working buffers could not be allocated. The results of the
// individual profiling runs are reported via stdout only and do not affect the return value.
int do_profiling__format_conversion__profile_set(mal_format formatIn, mal_format formatOut, mal_dither_mode ditherMode)
{
    // Generate our base data to begin with. This is generated from an f32 sine wave which is converted to formatIn. That then becomes our base data.
    mal_uint32 sampleCount = 1000000;

    float* pSourceData = (float*)mal_aligned_malloc(sampleCount*sizeof(*pSourceData), MAL_SIMD_ALIGNMENT);
    if (pSourceData == NULL) {
        printf("Out of memory.\n");
        return -1;
    }

    mal_sine_wave sineWave;
    mal_sine_wave_init(1.0, 400, 48000, &sineWave);
    mal_sine_wave_read(&sineWave, sampleCount, pSourceData);

    void* pBaseData = mal_aligned_malloc(sampleCount * mal_get_bytes_per_sample(formatIn), MAL_SIMD_ALIGNMENT);
    if (pBaseData == NULL) {
        // Release only what has been allocated so far.
        printf("Out of memory.\n");
        mal_aligned_free(pSourceData);
        return -1;
    }

    mal_pcm_convert(pBaseData, formatIn, pSourceData, mal_format_f32, sampleCount, mal_dither_mode_none);

    // Reference first so we can get a benchmark.
    void* pReferenceData = mal_aligned_malloc(sampleCount * mal_get_bytes_per_sample(formatOut), MAL_SIMD_ALIGNMENT);
    if (pReferenceData == NULL) {
        // Release only what has been allocated so far.
        printf("Out of memory.\n");
        mal_aligned_free(pBaseData);
        mal_aligned_free(pSourceData);
        return -1;
    }

    mal_timer timer;
    mal_timer_init(&timer);

    double referenceTime = mal_timer_get_time_in_seconds(&timer);
    {
        pcm_convert__reference(pReferenceData, formatOut, pBaseData, formatIn, sampleCount, ditherMode);
    }
    referenceTime = mal_timer_get_time_in_seconds(&timer) - referenceTime;

    // Here is where each optimized implementation is profiled. SIMD paths are only run when the CPU reports
    // support for them at runtime.
    do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_scalar, pReferenceData, referenceTime);

    if (mal_has_sse2()) {
        do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_sse2, pReferenceData, referenceTime);
    }
    if (mal_has_avx()) {
        do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx, pReferenceData, referenceTime);
    }
    if (mal_has_avx512f()) {
        do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx512, pReferenceData, referenceTime);
    }
    if (mal_has_neon()) {
        do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_neon, pReferenceData, referenceTime);
    }

    mal_aligned_free(pReferenceData);
    mal_aligned_free(pBaseData);
    mal_aligned_free(pSourceData);
    return 0;
}
// Entry point for the format conversion profiling.
//
// Currently profiles a single conversion set: f32 -> s16 with dithering disabled.
//
// Returns the result of that profiling set: 0 on success, -1 if it failed (e.g. out of memory).
int do_profiling__format_conversion()
{
    // Propagate the result rather than unconditionally reporting success.
    return do_profiling__format_conversion__profile_set(mal_format_f32, mal_format_s16, mal_dither_mode_none);
}
///////////////////////////////////////////////////////////////////////////////
//
// Channel Routing
//
///////////////////////////////////////////////////////////////////////////////
// Global f32 scratch buffers (8 x 48000 each) for the channel routing profiling code.
// NOTE(review): presumably laid out as [channel][frame], with the "Benchmark" buffer holding the baseline
// output that the other buffer is compared against - confirm against the channel routing functions.
float g_ChannelRouterProfilingOutputBenchmark[8][48000];
float g_ChannelRouterProfilingOutput[8][48000];
......@@ -416,6 +839,7 @@ int do_profiling__src__profile_set(src_data* pBaseData, mal_uint32 sampleRateIn,
// Now that we have the reference data to compare against we can go ahead and measure the SIMD optimizations.
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_scalar, &referenceData);
if (mal_has_sse2()) {
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_sse2, &referenceData);
}
......@@ -446,7 +870,7 @@ int do_profiling__src()
src_data baseData;
mal_zero_object(&baseData);
baseData.channels = 8;
baseData.frameCount = 10000;
baseData.frameCount = 100000;
for (mal_uint32 iChannel = 0; iChannel < baseData.channels; ++iChannel) {
baseData.pFrameData[iChannel] = (float*)mal_aligned_malloc((size_t)(baseData.frameCount * sizeof(float)), MAL_SIMD_ALIGNMENT);
if (baseData.pFrameData[iChannel] == NULL) {
......@@ -475,16 +899,33 @@ int do_profiling__src()
}
// Packs two 4xf32 vectors into a single 8xi16 vector.
//
// Each float lane is first converted to a 32-bit integer, then the eight 32-bit values are narrowed to
// 16 bits with signed saturation (values outside [-32768, 32767] clamp to the nearest bound). The lanes of
// f32_0 end up in the low half of the result and the lanes of f32_1 in the high half.
static inline __m128i drmath_vf32_to_vi16__sse2(__m128 f32_0, __m128 f32_1)
{
    __m128i i32_lo = _mm_cvtps_epi32(f32_0);
    __m128i i32_hi = _mm_cvtps_epi32(f32_1);
    return _mm_packs_epi32(i32_lo, i32_hi);
}
int main(int argc, char** argv)
{
(void)argc;
(void)argv;
{
//__m128 f0 = _mm_set_ps(32780, 2, 1, 0);
//__m128 f1 = _mm_set_ps(-32780, 6, 5, 4);
//__m128i r = drmath_vf32_to_vi16__sse2(f0, f1);
//int a = 5;
}
// Summary.
if (mal_has_sse2()) {
printf("Has SSE: YES\n");
printf("Has SSE2: YES\n");
} else {
printf("Has SSE: NO\n");
printf("Has SSE2: NO\n");
}
if (mal_has_avx()) {
printf("Has AVX: YES\n");
......@@ -505,6 +946,10 @@ int main(int argc, char** argv)
printf("\n");
// Format conversion.
do_profiling__format_conversion();
printf("\n\n");
// Channel routing.
do_profiling__channel_routing();
printf("\n\n");
......
......@@ -141,7 +141,7 @@
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<CompileAs>Default</CompileAs>
<EnableEnhancedInstructionSet>NoExtensions</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
......@@ -162,7 +162,7 @@
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<CompileAs>Default</CompileAs>
<EnableEnhancedInstructionSet>NoExtensions</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
......@@ -183,6 +183,7 @@
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<CompileAs>Default</CompileAs>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
......@@ -202,7 +203,7 @@
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<CompileAs>Default</CompileAs>
<EnableEnhancedInstructionSet>NoExtensions</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
......@@ -226,7 +227,7 @@
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<CompileAs>Default</CompileAs>
<EnableEnhancedInstructionSet>NoExtensions</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
......@@ -250,6 +251,7 @@
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<CompileAs>Default</CompileAs>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
......@@ -269,21 +271,21 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="mal_profiling.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="mal_test_0.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="mal_test_0.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="mal_test_0.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</ExcludedFromBuild>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment