audio: converting int32 to/from float shouldn't use doubles.
The concern is that a massive int sample, like 0x7FFFFFFF, won't fit in a float32, which doesn't have enough mantissa bits to hold a whole number this large exactly before we divide it down to a value between -1 and 1. Previously we would convert to double to get more bits, do the division, and cast back to a float. Going through double is more accurate, but it's 2x to 3x slower. Instead, we now shift out the least significant byte of the int32, so the value definitely fits in a float, and divide by 0x7FFFFF; this is still accurate to about 5 decimal places, and the difference doesn't appear to be perceptible.
parent
4df859c586
commit
e2ec1eb12e
|
@ -62,7 +62,7 @@ SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
|
||||||
|
|
||||||
#define DIVBY128 0.0078125f
|
#define DIVBY128 0.0078125f
|
||||||
#define DIVBY32768 0.000030517578125f
|
#define DIVBY32768 0.000030517578125f
|
||||||
#define DIVBY2147483648 0.00000000046566128730773926
|
#define DIVBY8388607 0.00000011920930376163766f
|
||||||
|
|
||||||
|
|
||||||
#if NEED_SCALAR_CONVERTER_FALLBACKS
|
#if NEED_SCALAR_CONVERTER_FALLBACKS
|
||||||
|
@ -152,7 +152,7 @@ SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
|
LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
|
||||||
|
|
||||||
for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
|
for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
|
||||||
*dst = (float) (((double) *src) * DIVBY2147483648);
|
*dst = ((float) (*src>>8)) * DIVBY8388607;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cvt->filters[++cvt->filter_index]) {
|
if (cvt->filters[++cvt->filter_index]) {
|
||||||
|
@ -280,7 +280,7 @@ SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
} else if (sample < -1.0f) {
|
} else if (sample < -1.0f) {
|
||||||
*dst = -2147483647;
|
*dst = -2147483647;
|
||||||
} else {
|
} else {
|
||||||
*dst = (Sint32)((double)sample * 2147483647.0);
|
*dst = ((Sint32)(sample * 8388607.0f)) << 8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -509,16 +509,6 @@ SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__GNUC__) && (__GNUC__ < 4)
|
|
||||||
/* these were added as of gcc-4.0: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19418 */
|
|
||||||
static inline __m128 _mm_castsi128_ps(__m128i __A) {
|
|
||||||
return (__m128) __A;
|
|
||||||
}
|
|
||||||
static inline __m128i _mm_castps_si128(__m128 __A) {
|
|
||||||
return (__m128i) __A;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void SDLCALL
|
static void SDLCALL
|
||||||
SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
{
|
{
|
||||||
|
@ -530,7 +520,7 @@ SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
|
|
||||||
/* Get dst aligned to 16 bytes */
|
/* Get dst aligned to 16 bytes */
|
||||||
for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
|
for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
|
||||||
*dst = (float) (((double) *src) * DIVBY2147483648);
|
*dst = ((float) (*src>>8)) * DIVBY8388607;
|
||||||
}
|
}
|
||||||
|
|
||||||
SDL_assert(!i || ((((size_t) dst) & 15) == 0));
|
SDL_assert(!i || ((((size_t) dst) & 15) == 0));
|
||||||
|
@ -538,15 +528,11 @@ SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
|
|
||||||
{
|
{
|
||||||
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
|
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
|
||||||
const __m128d divby2147483648 = _mm_set1_pd(DIVBY2147483648);
|
const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
|
||||||
const __m128i *mmsrc = (const __m128i *) src;
|
const __m128i *mmsrc = (const __m128i *) src;
|
||||||
while (i >= 4) { /* 4 * sint32 */
|
while (i >= 4) { /* 4 * sint32 */
|
||||||
const __m128i ints = _mm_load_si128(mmsrc);
|
/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
|
||||||
/* bitshift the whole register over, so _mm_cvtepi32_pd can read the top ints in the bottom of the vector. */
|
_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srli_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
|
||||||
const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(ints, 8)), divby2147483648);
|
|
||||||
const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483648);
|
|
||||||
/* convert to float32, bitshift/or to get these into a vector to store. */
|
|
||||||
_mm_store_ps(dst, _mm_castsi128_ps(_mm_or_si128(_mm_slli_si128(_mm_castps_si128(_mm_cvtpd_ps(doubles1)), 8), _mm_castps_si128(_mm_cvtpd_ps(doubles2)))));
|
|
||||||
i -= 4; mmsrc++; dst += 4;
|
i -= 4; mmsrc++; dst += 4;
|
||||||
}
|
}
|
||||||
src = (const Sint32 *) mmsrc;
|
src = (const Sint32 *) mmsrc;
|
||||||
|
@ -554,7 +540,7 @@ SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
|
|
||||||
/* Finish off any leftovers with scalar operations. */
|
/* Finish off any leftovers with scalar operations. */
|
||||||
while (i) {
|
while (i) {
|
||||||
*dst = (float) (((double) *src) * DIVBY2147483648);
|
*dst = ((float) (*src>>8)) * DIVBY8388607;
|
||||||
i--; src++; dst++;
|
i--; src++; dst++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -755,7 +741,7 @@ SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
|
|
||||||
/* Get dst aligned to 16 bytes */
|
/* Get dst aligned to 16 bytes */
|
||||||
for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
|
for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
|
||||||
*dst = (Sint32) (((double) *src) * 2147483647.0);
|
*dst = ((Sint32)(*src * 8388607.0f)) << 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
SDL_assert(!i || ((((size_t) dst) & 15) == 0));
|
SDL_assert(!i || ((((size_t) dst) & 15) == 0));
|
||||||
|
@ -763,14 +749,10 @@ SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
|
|
||||||
{
|
{
|
||||||
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
|
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
|
||||||
const __m128d mulby2147483647 = _mm_set1_pd(2147483647.0);
|
const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
|
||||||
__m128i *mmdst = (__m128i *) dst;
|
__m128i *mmdst = (__m128i *) dst;
|
||||||
while (i >= 4) { /* 4 * float32 */
|
while (i >= 4) { /* 4 * float32 */
|
||||||
const __m128 floats = _mm_load_ps(src);
|
_mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby8388607)), 8));
|
||||||
/* bitshift the whole register over, so _mm_cvtps_pd can read the top floats in the bottom of the vector. */
|
|
||||||
const __m128d doubles1 = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(floats), 8))), mulby2147483647);
|
|
||||||
const __m128d doubles2 = _mm_mul_pd(_mm_cvtps_pd(floats), mulby2147483647);
|
|
||||||
_mm_store_si128(mmdst, _mm_or_si128(_mm_slli_si128(_mm_cvtpd_epi32(doubles1), 8), _mm_cvtpd_epi32(doubles2)));
|
|
||||||
i -= 4; src += 4; mmdst++;
|
i -= 4; src += 4; mmdst++;
|
||||||
}
|
}
|
||||||
dst = (Sint32 *) mmdst;
|
dst = (Sint32 *) mmdst;
|
||||||
|
@ -778,7 +760,7 @@ SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
|
||||||
|
|
||||||
/* Finish off any leftovers with scalar operations. */
|
/* Finish off any leftovers with scalar operations. */
|
||||||
while (i) {
|
while (i) {
|
||||||
*dst = (Sint32) (((double) *src) * 2147483647.0);
|
*dst = ((Sint32)(*src * 8388607.0f)) << 8;
|
||||||
i--; src++; dst++;
|
i--; src++; dst++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue