Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
M
miniaudio
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Locked Files
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Security & Compliance
Security & Compliance
Dependency List
License Compliance
Packages
Packages
List
Container Registry
Analytics
Analytics
CI / CD
Code Review
Insights
Issues
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
MyCard
miniaudio
Commits
22d7b740
Commit
22d7b740
authored
May 26, 2018
by
David Reid
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Initial work on SSE2 optimizations for sample rate conversion.
parent
5dafa54f
Changes
2
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
430 additions
and
21 deletions
+430
-21
mini_al.h
mini_al.h
+130
-13
tests/mal_profiling.c
tests/mal_profiling.c
+300
-8
No files found.
mini_al.h
View file @
22d7b740
...
@@ -3211,8 +3211,19 @@ static MAL_INLINE float mal_mix_f32(float x, float y, float a)
...
@@ -3211,8 +3211,19 @@ static MAL_INLINE float mal_mix_f32(float x, float y, float a)
}
}
static
MAL_INLINE
float
mal_mix_f32_fast
(
float
x
,
float
y
,
float
a
)
static
MAL_INLINE
float
mal_mix_f32_fast
(
float
x
,
float
y
,
float
a
)
{
{
return x + (y - x)*a;
float
r0
=
(
y
-
x
);
float
r1
=
r0
*
a
;
return
x
+
r1
;
//return x + (y - x)*a;
}
#if defined(MAL_SUPPORT_SSE2)
static
MAL_INLINE
__m128
mal_mix_f32_fast__sse2
(
__m128
x
,
__m128
y
,
__m128
a
)
{
return
_mm_add_ps
(
x
,
_mm_mul_ps
(
_mm_sub_ps
(
y
,
x
),
a
));
}
}
#endif
static
MAL_INLINE
double
mal_mix_f64
(
double
x
,
double
y
,
double
a
)
static
MAL_INLINE
double
mal_mix_f64
(
double
x
,
double
y
,
double
a
)
{
{
...
@@ -3384,7 +3395,7 @@ void mal_timer_init(mal_timer* pTimer)
...
@@ -3384,7 +3395,7 @@ void mal_timer_init(mal_timer* pTimer)
LARGE_INTEGER
counter
;
LARGE_INTEGER
counter
;
QueryPerformanceCounter
(
&
counter
);
QueryPerformanceCounter
(
&
counter
);
pTimer->counter =
(mal_uint64)
counter.QuadPart;
pTimer
->
counter
=
counter
.
QuadPart
;
}
}
double
mal_timer_get_time_in_seconds
(
mal_timer
*
pTimer
)
double
mal_timer_get_time_in_seconds
(
mal_timer
*
pTimer
)
...
@@ -3394,7 +3405,7 @@ double mal_timer_get_time_in_seconds(mal_timer* pTimer)
...
@@ -3394,7 +3405,7 @@ double mal_timer_get_time_in_seconds(mal_timer* pTimer)
return
0
;
return
0
;
}
}
return (
counter.QuadPart - pTimer->counter) / (double)
g_mal_TimerFrequency.QuadPart;
return
(
double
)(
counter
.
QuadPart
-
pTimer
->
counter
)
/
g_mal_TimerFrequency
.
QuadPart
;
}
}
#elif defined(MAL_APPLE) && (__MAC_OS_X_VERSION_MIN_REQUIRED < 101200)
#elif defined(MAL_APPLE) && (__MAC_OS_X_VERSION_MIN_REQUIRED < 101200)
uint64_t
g_mal_TimerFrequency
=
0
;
uint64_t
g_mal_TimerFrequency
=
0
;
...
@@ -19677,7 +19688,7 @@ void mal_src__build_sinc_table__sinc(mal_src* pSRC)
...
@@ -19677,7 +19688,7 @@ void mal_src__build_sinc_table__sinc(mal_src* pSRC)
mal_assert
(
pSRC
!=
NULL
);
mal_assert
(
pSRC
!=
NULL
);
pSRC
->
sinc
.
table
[
0
]
=
1.0
f
;
pSRC
->
sinc
.
table
[
0
]
=
1.0
f
;
for (
int
i = 1; i < mal_countof(pSRC->sinc.table); i += 1) {
for
(
mal_uint32
i
=
1
;
i
<
mal_countof
(
pSRC
->
sinc
.
table
);
i
+=
1
)
{
double
x
=
i
*
MAL_PI_D
/
MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION
;
double
x
=
i
*
MAL_PI_D
/
MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION
;
pSRC
->
sinc
.
table
[
i
]
=
(
float
)(
sin
(
x
)
/
x
);
pSRC
->
sinc
.
table
[
i
]
=
(
float
)(
sin
(
x
)
/
x
);
}
}
...
@@ -19693,7 +19704,7 @@ void mal_src__build_sinc_table__hann(mal_src* pSRC)
...
@@ -19693,7 +19704,7 @@ void mal_src__build_sinc_table__hann(mal_src* pSRC)
{
{
mal_src__build_sinc_table__sinc
(
pSRC
);
mal_src__build_sinc_table__sinc
(
pSRC
);
for (
int
i = 0; i < mal_countof(pSRC->sinc.table); i += 1) {
for
(
mal_uint32
i
=
0
;
i
<
mal_countof
(
pSRC
->
sinc
.
table
);
i
+=
1
)
{
double
x
=
pSRC
->
sinc
.
table
[
i
];
double
x
=
pSRC
->
sinc
.
table
[
i
];
double
N
=
MAL_SRC_SINC_MAX_WINDOW_WIDTH
*
2
;
double
N
=
MAL_SRC_SINC_MAX_WINDOW_WIDTH
*
2
;
double
n
=
((
double
)(
i
)
/
MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION
)
+
MAL_SRC_SINC_MAX_WINDOW_WIDTH
;
double
n
=
((
double
)(
i
)
/
MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION
)
+
MAL_SRC_SINC_MAX_WINDOW_WIDTH
;
...
@@ -20070,7 +20081,7 @@ static MAL_INLINE float mal_src_sinc__interpolation_factor(const mal_src* pSRC,
...
@@ -20070,7 +20081,7 @@ static MAL_INLINE float mal_src_sinc__interpolation_factor(const mal_src* pSRC,
float
xabs
=
(
float
)
fabs
(
x
);
float
xabs
=
(
float
)
fabs
(
x
);
if
(
xabs
>=
MAL_SRC_SINC_MAX_WINDOW_WIDTH
/*pSRC->config.sinc.windowWidth*/
)
{
if
(
xabs
>=
MAL_SRC_SINC_MAX_WINDOW_WIDTH
/*pSRC->config.sinc.windowWidth*/
)
{
return 0;
xabs
=
1
;
// <-- A non-zero integer will always return 0.
}
}
xabs
=
xabs
*
MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION
;
xabs
=
xabs
*
MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION
;
...
@@ -20084,6 +20095,60 @@ static MAL_INLINE float mal_src_sinc__interpolation_factor(const mal_src* pSRC,
...
@@ -20084,6 +20095,60 @@ static MAL_INLINE float mal_src_sinc__interpolation_factor(const mal_src* pSRC,
#endif
#endif
}
}
#if defined(MAL_SUPPORT_SSE2)
static
MAL_INLINE
__m128
mal_fabsf_sse2
(
__m128
x
)
{
static
MAL_ALIGN
(
16
)
mal_uint32
mask
[
4
]
=
{
0x7FFFFFFF
,
0x7FFFFFFF
,
0x7FFFFFFF
,
0x7FFFFFFF
};
return
_mm_and_ps
(
*
(
__m128
*
)
mask
,
x
);
}
static
MAL_INLINE
__m128
mal_truncf_sse2
(
__m128
x
)
{
return
_mm_cvtepi32_ps
(
_mm_cvttps_epi32
(
x
));
}
static
MAL_INLINE
__m128
mal_src_sinc__interpolation_factor__sse2
(
const
mal_src
*
pSRC
,
__m128
*
x
)
{
__m128
windowWidth128
=
_mm_set1_ps
(
MAL_SRC_SINC_MAX_WINDOW_WIDTH
);
__m128
resolution128
=
_mm_set1_ps
(
MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION
);
__m128
one
=
_mm_set1_ps
(
1
);
__m128
xabs
=
mal_fabsf_sse2
(
*
x
);
// if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs;
__m128
xcmp
=
_mm_cmp_ps
(
windowWidth128
,
xabs
,
2
);
// 2 = Less than or equal = _mm_cmple_ps.
xabs
=
_mm_or_ps
(
_mm_and_ps
(
one
,
xcmp
),
_mm_andnot_ps
(
xcmp
,
xabs
));
// xabs = (xcmp) ? 1 : xabs;
xabs
=
_mm_mul_ps
(
xabs
,
resolution128
);
__m128i
ixabs
=
_mm_cvttps_epi32
(
xabs
);
__m128
lo
=
_mm_set_ps
(
pSRC
->
sinc
.
table
[((
int
*
)
&
ixabs
)[
3
]],
pSRC
->
sinc
.
table
[((
int
*
)
&
ixabs
)[
2
]],
pSRC
->
sinc
.
table
[((
int
*
)
&
ixabs
)[
1
]],
pSRC
->
sinc
.
table
[((
int
*
)
&
ixabs
)[
0
]]
);
__m128
hi
=
_mm_set_ps
(
pSRC
->
sinc
.
table
[((
int
*
)
&
ixabs
)[
3
]
+
1
],
pSRC
->
sinc
.
table
[((
int
*
)
&
ixabs
)[
2
]
+
1
],
pSRC
->
sinc
.
table
[((
int
*
)
&
ixabs
)[
1
]
+
1
],
pSRC
->
sinc
.
table
[((
int
*
)
&
ixabs
)[
0
]
+
1
]
);
__m128
a
=
_mm_sub_ps
(
xabs
,
_mm_cvtepi32_ps
(
ixabs
));
__m128
r
=
mal_mix_f32_fast__sse2
(
lo
,
hi
,
a
);
return
r
;
}
#endif
mal_uint64
mal_src_read_deinterleaved__sinc
(
mal_src
*
pSRC
,
mal_uint64
frameCount
,
void
**
ppSamplesOut
,
void
*
pUserData
)
mal_uint64
mal_src_read_deinterleaved__sinc
(
mal_src
*
pSRC
,
mal_uint64
frameCount
,
void
**
ppSamplesOut
,
void
*
pUserData
)
{
{
mal_assert
(
pSRC
!=
NULL
);
mal_assert
(
pSRC
!=
NULL
);
...
@@ -20122,21 +20187,66 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
...
@@ -20122,21 +20187,66 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
outputFramesToRead
=
maxOutputFramesToRead
;
outputFramesToRead
=
maxOutputFramesToRead
;
}
}
float
_windowSamplesUnaligned
[
MAL_SRC_SINC_MAX_WINDOW_WIDTH
*
2
+
MAL_SIMD_ALIGNMENT
];
float
*
windowSamples
=
(
float
*
)(((
mal_uintptr
)
_windowSamplesUnaligned
+
MAL_SIMD_ALIGNMENT
-
1
)
&
~
(
MAL_SIMD_ALIGNMENT
-
1
));
float
_iWindowFUnaligned
[
MAL_SRC_SINC_MAX_WINDOW_WIDTH
*
2
+
MAL_SIMD_ALIGNMENT
];
float
*
iWindowF
=
(
float
*
)(((
mal_uintptr
)
_iWindowFUnaligned
+
MAL_SIMD_ALIGNMENT
-
1
)
&
~
(
MAL_SIMD_ALIGNMENT
-
1
));
for
(
mal_int32
i
=
0
;
i
<
windowWidth2
;
++
i
)
{
iWindowF
[
i
]
=
(
float
)(
i
-
windowWidth
);
}
for
(
mal_uint32
iChannel
=
0
;
iChannel
<
pSRC
->
config
.
channels
;
iChannel
+=
1
)
{
for
(
mal_uint32
iChannel
=
0
;
iChannel
<
pSRC
->
config
.
channels
;
iChannel
+=
1
)
{
// Do SRC.
// Do SRC.
float
timeIn
=
timeInBeg
;
float
timeIn
=
timeInBeg
;
for
(
mal_uint32
iSample
=
0
;
iSample
<
outputFramesToRead
;
iSample
+=
1
)
{
for
(
mal_uint32
iSample
=
0
;
iSample
<
outputFramesToRead
;
iSample
+=
1
)
{
mal_int32 iTimeIn = (mal_int32)timeIn;
float
sampleOut
=
0
;
float
iTimeInF
=
mal_floorf
(
timeIn
);
mal_uint32
iTimeIn
=
(
mal_uint32
)
iTimeInF
;
//mal_int32 iWindowBeg = -windowWidth+1;
//mal_int32 iWindowEnd = windowWidth;
mal_int32
iWindow
=
0
;
// Pre-load the window samples into an aligned buffer to begin with. Need to put these into an aligned buffer to make SIMD easier.
windowSamples
[
0
]
=
0
;
// <-- The first sample is always zero.
for
(
mal_int32
i
=
1
;
i
<
windowWidth2
;
++
i
)
{
windowSamples
[
i
]
=
mal_src_sinc__get_input_sample_from_window
(
pSRC
,
iChannel
,
iTimeIn
,
i
-
windowWidth
);
}
float sampleOut = 0;
#if defined(MAL_SUPPORT_SSE2)
for (mal_int32 iWindow = -windowWidth+1; iWindow < windowWidth; iWindow += 1) {
if
(
pSRC
->
useSSE2
)
{
float t = (timeIn - iTimeIn);
__m128
t
=
_mm_set1_ps
((
timeIn
-
iTimeInF
));
float w = (float)(iWindow);
mal_int32
windowWidth4
=
windowWidth2
>>
2
;
for
(
mal_int32
iWindow4
=
0
;
iWindow4
<
windowWidth4
;
iWindow4
+=
1
)
{
__m128
*
s
=
(
__m128
*
)
windowSamples
+
iWindow4
;
__m128
*
w
=
(
__m128
*
)
iWindowF
+
iWindow4
;
__m128
x
=
_mm_sub_ps
(
t
,
*
w
);
__m128
a
=
mal_src_sinc__interpolation_factor__sse2
(
pSRC
,
&
x
);
__m128
r
=
_mm_mul_ps
(
*
s
,
a
);
sampleOut
+=
((
float
*
)(
&
r
))[
0
];
sampleOut
+=
((
float
*
)(
&
r
))[
1
];
sampleOut
+=
((
float
*
)(
&
r
))[
2
];
sampleOut
+=
((
float
*
)(
&
r
))[
3
];
}
iWindow
+=
windowWidth4
*
4
;
}
#endif
// Non-SIMD/Reference implementation.
for
(;
iWindow
<
windowWidth2
;
iWindow
+=
1
)
{
float
s
=
windowSamples
[
iWindow
];
float
t
=
(
timeIn
-
iTimeIn
);
float
w
=
iWindowF
[
iWindow
];
float
a
=
mal_src_sinc__interpolation_factor
(
pSRC
,
(
t
-
w
));
float
a
=
mal_src_sinc__interpolation_factor
(
pSRC
,
(
t
-
w
));
float
s = mal_src_sinc__get_input_sample_from_window(pSRC, iChannel, iTimeIn, iWindow)
;
float
r
=
s
*
a
;
sampleOut +=
s * a
;
sampleOut
+=
r
;
}
}
ppNextSamplesOut
[
iChannel
][
iSample
]
=
(
float
)
sampleOut
;
ppNextSamplesOut
[
iChannel
][
iSample
]
=
(
float
)
sampleOut
;
...
@@ -21902,6 +22012,13 @@ mal_result mal_decoder_init__internal(mal_decoder_read_proc onRead, mal_decoder_
...
@@ -21902,6 +22012,13 @@ mal_result mal_decoder_init__internal(mal_decoder_read_proc onRead, mal_decoder_
mal_assert
(
pConfig
!=
NULL
);
mal_assert
(
pConfig
!=
NULL
);
mal_assert
(
pDecoder
!=
NULL
);
mal_assert
(
pDecoder
!=
NULL
);
// Silence some warnings in the case that we don't have any decoder backends enabled.
(
void
)
onRead
;
(
void
)
onSeek
;
(
void
)
pUserData
;
(
void
)
pConfig
;
(
void
)
pDecoder
;
// We use trial and error to open a decoder.
// We use trial and error to open a decoder.
mal_result
result
=
MAL_NO_BACKEND
;
mal_result
result
=
MAL_NO_BACKEND
;
...
...
tests/mal_profiling.c
View file @
22d7b740
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment