diff --git a/src/audio/SDL_audiotypecvt.c b/src/audio/SDL_audiotypecvt.c
index 43cabd9b5..23ad9d443 100644
--- a/src/audio/SDL_audiotypecvt.c
+++ b/src/audio/SDL_audiotypecvt.c
@@ -22,9 +22,6 @@
 #include "SDL_sysaudio.h"
 
-// TODO: NEON is disabled until https://github.com/libsdl-org/SDL/issues/8352 can be fixed
-#undef SDL_NEON_INTRINSICS
-
 #define DIVBY2147483648 0.0000000004656612873077392578125f // 0x1p-31f
 
 // start fallback scalar converters
@@ -186,11 +183,27 @@ static void SDL_Convert_F32_to_S32_Scalar(Sint32 *dst, const float *src, int num
 
 // end fallback scalar converters
 
+// Convert forwards, when sizeof(*src) >= sizeof(*dst)
+#define CONVERT_16_FWD(CVT1, CVT16) \
+    int i = 0; \
+    if (num_samples >= 16) { \
+        while ((uintptr_t)(&dst[i]) & 15) { CVT1 ++i; } \
+        while ((i + 16) <= num_samples) { CVT16 i += 16; } \
+    } \
+    while (i < num_samples) { CVT1 ++i; }
+
+// Convert backwards, when sizeof(*src) <= sizeof(*dst)
+#define CONVERT_16_REV(CVT1, CVT16) \
+    int i = num_samples; \
+    if (i >= 16) { \
+        while ((uintptr_t)(&dst[i]) & 15) { --i; CVT1 } \
+        while (i >= 16) { i -= 16; CVT16 } \
+    } \
+    while (i > 0) { --i; CVT1 }
+
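Reviewer note: these two macros replace the hand-rolled head/block/tail loops that every converter below used to carry. Both peel single samples with CVT1 until dst reaches 16-byte alignment, run CVT16 on aligned blocks of 16, then finish the remainder with CVT1; that is what lets the block bodies use aligned stores while loads stay unaligned. CONVERT_16_FWD walks up, for conversions where the destination is no wider than the source; CONVERT_16_REV walks down from the end so a widening in-place conversion never overwrites samples it has not read yet. A standalone sketch (not part of the patch; uses a local copy of the macro and the GCC/Clang aligned attribute) showing how the split falls out:

    #include <stdint.h>
    #include <stdio.h>

    #define CONVERT_16_FWD(CVT1, CVT16) \
        int i = 0; \
        if (num_samples >= 16) { \
            while ((uintptr_t)(&dst[i]) & 15) { CVT1 ++i; } \
            while ((i + 16) <= num_samples) { CVT16 i += 16; } \
        } \
        while (i < num_samples) { CVT1 ++i; }

    int main(void)
    {
        static float buffer[64] __attribute__((aligned(64)));
        float *dst = buffer + 1;  // deliberately misaligned by 4 bytes
        int num_samples = 37;
        int singles = 0, blocks = 0;

        CONVERT_16_FWD({ ++singles; }, { ++blocks; })

        // 3 singles to reach alignment, 2 blocks of 16, then 2 tail singles
        printf("singles=%d blocks=%d total=%d\n", singles, blocks, singles + blocks * 16);
        return 0;
    }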
 #ifdef SDL_SSE2_INTRINSICS
 static void SDL_TARGETING("sse2") SDL_Convert_S8_to_F32_SSE2(float *dst, const Sint8 *src, int num_samples)
 {
-    int i = num_samples;
-
     /* 1) Flip the sign bit to convert from S8 to U8 format
      * 2) Construct a float in the range [65536.0, 65538.0)
      * 3) Shift the float range to [-1.0, 1.0)
@@ -202,35 +215,28 @@ static void SDL_TARGETING("sse2") SDL_Convert_S8_to_F32_SSE2(float *dst, const S
 
     LOG_DEBUG_AUDIO_CONVERT("S8", "F32 (using SSE2)");
 
-    while (i >= 16) {
-        i -= 16;
-
+    CONVERT_16_REV({
+        _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint8)src[i] ^ 0x47800080u)), offset));
+    }, {
         const __m128i bytes = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i]), flipper);
 
-        const __m128i shorts1 = _mm_unpacklo_epi8(bytes, zero);
-        const __m128i shorts2 = _mm_unpackhi_epi8(bytes, zero);
+        const __m128i shorts0 = _mm_unpacklo_epi8(bytes, zero);
+        const __m128i shorts1 = _mm_unpackhi_epi8(bytes, zero);
 
-        const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
-        const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
-        const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts2, caster)), offset);
-        const __m128 floats4 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts2, caster)), offset);
+        const __m128 floats0 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts0, caster)), offset);
+        const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts0, caster)), offset);
+        const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
+        const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
 
-        _mm_storeu_ps(&dst[i], floats1);
-        _mm_storeu_ps(&dst[i + 4], floats2);
-        _mm_storeu_ps(&dst[i + 8], floats3);
-        _mm_storeu_ps(&dst[i + 12], floats4);
-    }
-
-    while (i) {
-        --i;
-        _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint8)src[i] ^ 0x47800080u)), offset));
-    }
+        _mm_store_ps(&dst[i], floats0);
+        _mm_store_ps(&dst[i + 4], floats1);
+        _mm_store_ps(&dst[i + 8], floats2);
+        _mm_store_ps(&dst[i + 12], floats3);
+    })
 }
 
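Reviewer note: the bit trick in the comment can be checked exhaustively in scalar code. XOR with 0x80 rebases the signed byte to unsigned, and 0x478000xx as float bits is 65536.0 + xx/128.0, because one mantissa ulp at exponent 2^16 is exactly 2^-7; the offset constant (declared in unchanged lines outside the hunk) then removes the 65537 bias. Standalone check (not part of the patch; i2f is a local helper, not an SDL function):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static float i2f(uint32_t bits)
    {
        float f;
        memcpy(&f, &bits, sizeof(f));  // reinterpret the bit pattern as a float
        return f;
    }

    int main(void)
    {
        for (int s = -128; s <= 127; ++s) {
            float fast = i2f((uint8_t)s ^ 0x47800080u) - 65537.0f;
            if (fast != (float)s / 128.0f) {
                printf("mismatch at %d\n", s);
                return 1;
            }
        }
        puts("S8 identity holds for all 256 inputs");
        return 0;
    }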
 static void SDL_TARGETING("sse2") SDL_Convert_U8_to_F32_SSE2(float *dst, const Uint8 *src, int num_samples)
 {
-    int i = num_samples;
-
     /* 1) Construct a float in the range [65536.0, 65538.0)
      * 2) Shift the float range to [-1.0, 1.0)
      * dst[i] = i2f(src[i] | 0x47800000) - 65537.0 */
@@ -240,35 +246,28 @@ static void SDL_TARGETING("sse2") SDL_Convert_U8_to_F32_SSE2(float *dst, const U
 
     LOG_DEBUG_AUDIO_CONVERT("U8", "F32 (using SSE2)");
 
-    while (i >= 16) {
-        i -= 16;
-
+    CONVERT_16_REV({
+        _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint8)src[i] ^ 0x47800000u)), offset));
+    }, {
         const __m128i bytes = _mm_loadu_si128((const __m128i *)&src[i]);
 
-        const __m128i shorts1 = _mm_unpacklo_epi8(bytes, zero);
-        const __m128i shorts2 = _mm_unpackhi_epi8(bytes, zero);
+        const __m128i shorts0 = _mm_unpacklo_epi8(bytes, zero);
+        const __m128i shorts1 = _mm_unpackhi_epi8(bytes, zero);
 
-        const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
-        const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
-        const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts2, caster)), offset);
-        const __m128 floats4 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts2, caster)), offset);
+        const __m128 floats0 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts0, caster)), offset);
+        const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts0, caster)), offset);
+        const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
+        const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
 
-        _mm_storeu_ps(&dst[i], floats1);
-        _mm_storeu_ps(&dst[i + 4], floats2);
-        _mm_storeu_ps(&dst[i + 8], floats3);
-        _mm_storeu_ps(&dst[i + 12], floats4);
-    }
-
-    while (i) {
-        --i;
-        _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint8)src[i] ^ 0x47800000u)), offset));
-    }
+        _mm_store_ps(&dst[i], floats0);
+        _mm_store_ps(&dst[i + 4], floats1);
+        _mm_store_ps(&dst[i + 8], floats2);
+        _mm_store_ps(&dst[i + 12], floats3);
+    })
 }
 
 static void SDL_TARGETING("sse2") SDL_Convert_S16_to_F32_SSE2(float *dst, const Sint16 *src, int num_samples)
 {
-    int i = num_samples;
-
     /* 1) Flip the sign bit to convert from S16 to U16 format
      * 2) Construct a float in the range [256.0, 258.0)
      * 3) Shift the float range to [-1.0, 1.0)
@@ -279,67 +278,53 @@ static void SDL_TARGETING("sse2") SDL_Convert_S16_to_F32_SSE2(float *dst, const
 
     LOG_DEBUG_AUDIO_CONVERT("S16", "F32 (using SSE2)");
 
-    while (i >= 16) {
-        i -= 16;
-
-        const __m128i shorts1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i]), flipper);
-        const __m128i shorts2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i + 8]), flipper);
-
-        const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
-        const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
-        const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts2, caster)), offset);
-        const __m128 floats4 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts2, caster)), offset);
-
-        _mm_storeu_ps(&dst[i], floats1);
-        _mm_storeu_ps(&dst[i + 4], floats2);
-        _mm_storeu_ps(&dst[i + 8], floats3);
-        _mm_storeu_ps(&dst[i + 12], floats4);
-    }
-
-    while (i) {
-        --i;
+    CONVERT_16_REV({
         _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint16)src[i] ^ 0x43808000u)), offset));
-    }
+    }, {
+        const __m128i shorts0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i]), flipper);
+        const __m128i shorts1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i + 8]), flipper);
 
+        const __m128 floats0 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts0, caster)), offset);
+        const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts0, caster)), offset);
+        const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
+        const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
+
+        _mm_store_ps(&dst[i], floats0);
+        _mm_store_ps(&dst[i + 4], floats1);
+        _mm_store_ps(&dst[i + 8], floats2);
+        _mm_store_ps(&dst[i + 12], floats3);
+    })
 }
 
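Reviewer note: the 16-bit version is the same trick one level up: one mantissa ulp at exponent 2^8 is 2^-15, so the XOR builds 257.0 + s/32768 and the offset removes the 257. The U8 converter above differs from S8 only in not flipping the input's sign bit. Exhaustive standalone check (not part of the patch; i2f as in the earlier sketch):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static float i2f(uint32_t bits)
    {
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    int main(void)
    {
        for (int s = -32768; s <= 32767; ++s) {
            float fast = i2f((uint16_t)s ^ 0x43808000u) - 257.0f;
            if (fast != (float)s / 32768.0f) {
                printf("mismatch at %d\n", s);
                return 1;
            }
        }
        puts("S16 identity holds for all 65536 inputs");
        return 0;
    }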
 static void SDL_TARGETING("sse2") SDL_Convert_S32_to_F32_SSE2(float *dst, const Sint32 *src, int num_samples)
 {
-    int i = num_samples;
-
     // dst[i] = f32(src[i]) / f32(0x80000000)
     const __m128 scaler = _mm_set1_ps(DIVBY2147483648);
 
     LOG_DEBUG_AUDIO_CONVERT("S32", "F32 (using SSE2)");
 
-    while (i >= 16) {
-        i -= 16;
-
-        const __m128i ints1 = _mm_loadu_si128((const __m128i *)&src[i]);
-        const __m128i ints2 = _mm_loadu_si128((const __m128i *)&src[i + 4]);
-        const __m128i ints3 = _mm_loadu_si128((const __m128i *)&src[i + 8]);
-        const __m128i ints4 = _mm_loadu_si128((const __m128i *)&src[i + 12]);
+    CONVERT_16_FWD({
+        _mm_store_ss(&dst[i], _mm_mul_ss(_mm_cvt_si2ss(_mm_setzero_ps(), src[i]), scaler));
+    }, {
+        const __m128i ints0 = _mm_loadu_si128((const __m128i *)&src[i]);
+        const __m128i ints1 = _mm_loadu_si128((const __m128i *)&src[i + 4]);
+        const __m128i ints2 = _mm_loadu_si128((const __m128i *)&src[i + 8]);
+        const __m128i ints3 = _mm_loadu_si128((const __m128i *)&src[i + 12]);
 
+        const __m128 floats0 = _mm_mul_ps(_mm_cvtepi32_ps(ints0), scaler);
         const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(ints1), scaler);
         const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(ints2), scaler);
         const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(ints3), scaler);
-        const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(ints4), scaler);
 
-        _mm_storeu_ps(&dst[i], floats1);
-        _mm_storeu_ps(&dst[i + 4], floats2);
-        _mm_storeu_ps(&dst[i + 8], floats3);
-        _mm_storeu_ps(&dst[i + 12], floats4);
-    }
-
-    while (i) {
-        --i;
-        _mm_store_ss(&dst[i], _mm_mul_ss(_mm_cvt_si2ss(_mm_setzero_ps(), src[i]), scaler));
-    }
+        _mm_store_ps(&dst[i], floats0);
+        _mm_store_ps(&dst[i + 4], floats1);
+        _mm_store_ps(&dst[i + 8], floats2);
+        _mm_store_ps(&dst[i + 12], floats3);
+    })
 }
 
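Reviewer note: DIVBY2147483648 is exactly 0x1p-31f, so this path is (float)src[i] * 2^-31; _mm_cvtepi32_ps rounds the 32-bit sample to the nearest representable float and the power-of-two multiply is exact. One consequence worth knowing is that full-scale positive input rounds up to exactly 1.0f. Standalone illustration (not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        printf("%.10f\n", (float)INT32_MAX * 0x1p-31f);   //  1.0000000000 (rounds up)
        printf("%.10f\n", (float)INT32_MIN * 0x1p-31f);   // -1.0000000000 (exact)
        printf("%.10f\n", (float)0x40000000 * 0x1p-31f);  //  0.5000000000 (exact)
        return 0;
    }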
 static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S8_SSE2(Sint8 *dst, const float *src, int num_samples)
 {
-    int i = num_samples;
-
     /* 1) Shift the float range from [-1.0, 1.0] to [98303.0, 98305.0]
      * 2) Extract the lowest 16 bits and clamp to [-128, 127]
      * Overflow is correctly handled for inputs between roughly [-255.0, 255.0]
@@ -349,43 +334,31 @@ static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S8_SSE2(Sint8 *dst, const f
 
     LOG_DEBUG_AUDIO_CONVERT("F32", "S8 (using SSE2)");
 
-    while (i >= 16) {
-        const __m128 floats1 = _mm_loadu_ps(&src[0]);
-        const __m128 floats2 = _mm_loadu_ps(&src[4]);
-        const __m128 floats3 = _mm_loadu_ps(&src[8]);
-        const __m128 floats4 = _mm_loadu_ps(&src[12]);
+    CONVERT_16_FWD({
+        const __m128i ints = _mm_castps_si128(_mm_add_ss(_mm_load_ss(&src[i]), offset));
+        dst[i] = (Sint8)(_mm_cvtsi128_si32(_mm_packs_epi16(ints, ints)) & 0xFF);
+    }, {
+        const __m128 floats0 = _mm_loadu_ps(&src[i]);
+        const __m128 floats1 = _mm_loadu_ps(&src[i + 4]);
+        const __m128 floats2 = _mm_loadu_ps(&src[i + 8]);
+        const __m128 floats3 = _mm_loadu_ps(&src[i + 12]);
 
+        const __m128i ints0 = _mm_castps_si128(_mm_add_ps(floats0, offset));
         const __m128i ints1 = _mm_castps_si128(_mm_add_ps(floats1, offset));
         const __m128i ints2 = _mm_castps_si128(_mm_add_ps(floats2, offset));
         const __m128i ints3 = _mm_castps_si128(_mm_add_ps(floats3, offset));
-        const __m128i ints4 = _mm_castps_si128(_mm_add_ps(floats4, offset));
 
-        const __m128i shorts1 = _mm_and_si128(_mm_packs_epi16(ints1, ints2), mask);
-        const __m128i shorts2 = _mm_and_si128(_mm_packs_epi16(ints3, ints4), mask);
+        const __m128i shorts0 = _mm_and_si128(_mm_packs_epi16(ints0, ints1), mask);
+        const __m128i shorts1 = _mm_and_si128(_mm_packs_epi16(ints2, ints3), mask);
 
-        const __m128i bytes = _mm_packus_epi16(shorts1, shorts2);
+        const __m128i bytes = _mm_packus_epi16(shorts0, shorts1);
 
-        _mm_storeu_si128((__m128i*)dst, bytes);
-
-        i -= 16;
-        src += 16;
-        dst += 16;
-    }
-
-    while (i) {
-        const __m128i ints = _mm_castps_si128(_mm_add_ss(_mm_load_ss(src), offset));
-        *dst = (Sint8)(_mm_cvtsi128_si32(_mm_packs_epi16(ints, ints)) & 0xFF);
-
-        --i;
-        ++src;
-        ++dst;
-    }
+        _mm_store_si128((__m128i*)&dst[i], bytes);
+    })
 }
 
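Reviewer note: the forward trick here is the mirror image of the decode side. Adding 98304.0f (0x47C00000) quantizes x to multiples of 1/128, since the float ulp in [65536, 131072) is 2^-7, and the low 16 bits of the sum's bit pattern are then x * 128 as a signed 16-bit value, ready for a saturating pack. Standalone check (not part of the patch; the clamp mimics what _mm_packs_epi16 does per lane):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static int8_t f32_to_s8_trick(float x)
    {
        const float v = x + 98304.0f;
        uint32_t bits;
        memcpy(&bits, &v, sizeof(bits));
        int16_t q = (int16_t)(bits & 0xFFFF);  // == x * 128, rounded to nearest even
        if (q > 127) q = 127;                  // saturating pack, one lane
        if (q < -128) q = -128;
        return (int8_t)q;
    }

    int main(void)
    {
        for (int n = -1500; n <= 1500; ++n) {  // covers the clamped ends too
            const float x = (float)n / 1000.0f;
            const int r = (int)rintf(x * 128.0f);
            const int ref = r < -128 ? -128 : (r > 127 ? 127 : r);
            if (f32_to_s8_trick(x) != ref) {
                printf("mismatch at %f\n", x);
                return 1;
            }
        }
        puts("F32 -> S8 trick matches the naive clamp");
        return 0;
    }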
 static void SDL_TARGETING("sse2") SDL_Convert_F32_to_U8_SSE2(Uint8 *dst, const float *src, int num_samples)
 {
-    int i = num_samples;
-
     /* 1) Shift the float range from [-1.0, 1.0] to [98304.0, 98306.0]
      * 2) Extract the lowest 16 bits and clamp to [0, 255]
      * Overflow is correctly handled for inputs between roughly [-254.0, 254.0]
@@ -395,43 +368,31 @@ static void SDL_TARGETING("sse2") SDL_Convert_F32_to_U8_SSE2(Uint8 *dst, const f
 
     LOG_DEBUG_AUDIO_CONVERT("F32", "U8 (using SSE2)");
 
-    while (i >= 16) {
-        const __m128 floats1 = _mm_loadu_ps(&src[0]);
-        const __m128 floats2 = _mm_loadu_ps(&src[4]);
-        const __m128 floats3 = _mm_loadu_ps(&src[8]);
-        const __m128 floats4 = _mm_loadu_ps(&src[12]);
+    CONVERT_16_FWD({
+        const __m128i ints = _mm_castps_si128(_mm_add_ss(_mm_load_ss(&src[i]), offset));
+        dst[i] = (Uint8)(_mm_cvtsi128_si32(_mm_packus_epi16(ints, ints)) & 0xFF);
+    }, {
+        const __m128 floats0 = _mm_loadu_ps(&src[i]);
+        const __m128 floats1 = _mm_loadu_ps(&src[i + 4]);
+        const __m128 floats2 = _mm_loadu_ps(&src[i + 8]);
+        const __m128 floats3 = _mm_loadu_ps(&src[i + 12]);
 
+        const __m128i ints0 = _mm_castps_si128(_mm_add_ps(floats0, offset));
         const __m128i ints1 = _mm_castps_si128(_mm_add_ps(floats1, offset));
         const __m128i ints2 = _mm_castps_si128(_mm_add_ps(floats2, offset));
         const __m128i ints3 = _mm_castps_si128(_mm_add_ps(floats3, offset));
-        const __m128i ints4 = _mm_castps_si128(_mm_add_ps(floats4, offset));
 
-        const __m128i shorts1 = _mm_and_si128(_mm_packus_epi16(ints1, ints2), mask);
-        const __m128i shorts2 = _mm_and_si128(_mm_packus_epi16(ints3, ints4), mask);
+        const __m128i shorts0 = _mm_and_si128(_mm_packus_epi16(ints0, ints1), mask);
+        const __m128i shorts1 = _mm_and_si128(_mm_packus_epi16(ints2, ints3), mask);
 
-        const __m128i bytes = _mm_packus_epi16(shorts1, shorts2);
+        const __m128i bytes = _mm_packus_epi16(shorts0, shorts1);
 
-        _mm_storeu_si128((__m128i*)dst, bytes);
-
-        i -= 16;
-        src += 16;
-        dst += 16;
-    }
-
-    while (i) {
-        const __m128i ints = _mm_castps_si128(_mm_add_ss(_mm_load_ss(src), offset));
-        *dst = (Uint8)(_mm_cvtsi128_si32(_mm_packus_epi16(ints, ints)) & 0xFF);
-
-        --i;
-        ++src;
-        ++dst;
-    }
+        _mm_store_si128((__m128i*)&dst[i], bytes);
+    })
 }
 
 static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S16_SSE2(Sint16 *dst, const float *src, int num_samples)
 {
-    int i = num_samples;
-
     /* 1) Shift the float range from [-1.0, 1.0] to [256.0, 258.0]
      * 2) Shift the int range from [0x43800000, 0x43810000] to [-32768,32768]
      * 3) Clamp to range [-32768,32767]
@@ -441,42 +402,30 @@ static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S16_SSE2(Sint16 *dst, const
 
     LOG_DEBUG_AUDIO_CONVERT("F32", "S16 (using SSE2)");
 
-    while (i >= 16) {
-        const __m128 floats1 = _mm_loadu_ps(&src[0]);
-        const __m128 floats2 = _mm_loadu_ps(&src[4]);
-        const __m128 floats3 = _mm_loadu_ps(&src[8]);
-        const __m128 floats4 = _mm_loadu_ps(&src[12]);
+    CONVERT_16_FWD({
+        const __m128i ints = _mm_sub_epi32(_mm_castps_si128(_mm_add_ss(_mm_load_ss(&src[i]), offset)), _mm_castps_si128(offset));
+        dst[i] = (Sint16)(_mm_cvtsi128_si32(_mm_packs_epi32(ints, ints)) & 0xFFFF);
+    }, {
+        const __m128 floats0 = _mm_loadu_ps(&src[i]);
+        const __m128 floats1 = _mm_loadu_ps(&src[i + 4]);
+        const __m128 floats2 = _mm_loadu_ps(&src[i + 8]);
+        const __m128 floats3 = _mm_loadu_ps(&src[i + 12]);
 
+        const __m128i ints0 = _mm_sub_epi32(_mm_castps_si128(_mm_add_ps(floats0, offset)), _mm_castps_si128(offset));
         const __m128i ints1 = _mm_sub_epi32(_mm_castps_si128(_mm_add_ps(floats1, offset)), _mm_castps_si128(offset));
         const __m128i ints2 = _mm_sub_epi32(_mm_castps_si128(_mm_add_ps(floats2, offset)), _mm_castps_si128(offset));
         const __m128i ints3 = _mm_sub_epi32(_mm_castps_si128(_mm_add_ps(floats3, offset)), _mm_castps_si128(offset));
-        const __m128i ints4 = _mm_sub_epi32(_mm_castps_si128(_mm_add_ps(floats4, offset)), _mm_castps_si128(offset));
 
-        const __m128i shorts1 = _mm_packs_epi32(ints1, ints2);
-        const __m128i shorts2 = _mm_packs_epi32(ints3, ints4);
+        const __m128i shorts0 = _mm_packs_epi32(ints0, ints1);
+        const __m128i shorts1 = _mm_packs_epi32(ints2, ints3);
 
-        _mm_storeu_si128((__m128i*)&dst[0], shorts1);
-        _mm_storeu_si128((__m128i*)&dst[8], shorts2);
-
-        i -= 16;
-        src += 16;
-        dst += 16;
-    }
-
-    while (i) {
-        const __m128i ints = _mm_sub_epi32(_mm_castps_si128(_mm_add_ss(_mm_load_ss(src), offset)), _mm_castps_si128(offset));
-        *dst = (Sint16)(_mm_cvtsi128_si32(_mm_packs_epi32(ints, ints)) & 0xFFFF);
-
-        --i;
-        ++src;
-        ++dst;
-    }
+        _mm_store_si128((__m128i*)&dst[i], shorts0);
+        _mm_store_si128((__m128i*)&dst[i + 8], shorts1);
+    })
 }
 
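Reviewer note: the S16 output path avoids the AND mask by subtracting the integer representation of the offset instead. The bit pattern of (x + 257.0f) minus the bit pattern of 257.0f (0x43808000) is x * 32768 rounded to nearest, as a plain Sint32; _mm_packs_epi32 then clamps it to [-32768, 32767]. Standalone check (not part of the patch):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static int32_t f32_to_s16_bits(float x)
    {
        const float v = x + 257.0f;  // float ulp in [256, 512) is 2^-15
        uint32_t bits;
        memcpy(&bits, &v, sizeof(bits));
        return (int32_t)(bits - 0x43808000u);  // subtract the bits of 257.0f
    }

    int main(void)
    {
        for (int n = -1000; n <= 1000; ++n) {
            const float x = (float)n / 1000.0f;
            if (f32_to_s16_bits(x) != (int32_t)rintf(x * 32768.0f)) {
                printf("mismatch at %f\n", x);
                return 1;
            }
        }
        puts("F32 -> S16 bit identity holds");
        return 0;
    }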
 static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S32_SSE2(Sint32 *dst, const float *src, int num_samples)
 {
-    int i = num_samples;
-
     /* 1) Scale the float range from [-1.0, 1.0] to [-2147483648.0, 2147483648.0]
      * 2) Convert to integer (values too small/large become 0x80000000 = -2147483648)
      * 3) Fixup values which were too large (0x80000000 ^ 0xFFFFFFFF = 2147483647)
@@ -485,458 +434,237 @@ static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S32_SSE2(Sint32 *dst, const
 
     LOG_DEBUG_AUDIO_CONVERT("F32", "S32 (using SSE2)");
 
-    while (i >= 16) {
-        const __m128 floats1 = _mm_loadu_ps(&src[0]);
-        const __m128 floats2 = _mm_loadu_ps(&src[4]);
-        const __m128 floats3 = _mm_loadu_ps(&src[8]);
-        const __m128 floats4 = _mm_loadu_ps(&src[12]);
-
-        const __m128 values1 = _mm_mul_ps(floats1, limit);
-        const __m128 values2 = _mm_mul_ps(floats2, limit);
-        const __m128 values3 = _mm_mul_ps(floats3, limit);
-        const __m128 values4 = _mm_mul_ps(floats4, limit);
-
-        const __m128i ints1 = _mm_xor_si128(_mm_cvttps_epi32(values1), _mm_castps_si128(_mm_cmpge_ps(values1, limit)));
-        const __m128i ints2 = _mm_xor_si128(_mm_cvttps_epi32(values2), _mm_castps_si128(_mm_cmpge_ps(values2, limit)));
-        const __m128i ints3 = _mm_xor_si128(_mm_cvttps_epi32(values3), _mm_castps_si128(_mm_cmpge_ps(values3, limit)));
-        const __m128i ints4 = _mm_xor_si128(_mm_cvttps_epi32(values4), _mm_castps_si128(_mm_cmpge_ps(values4, limit)));
-
-        _mm_storeu_si128((__m128i*)&dst[0], ints1);
-        _mm_storeu_si128((__m128i*)&dst[4], ints2);
-        _mm_storeu_si128((__m128i*)&dst[8], ints3);
-        _mm_storeu_si128((__m128i*)&dst[12], ints4);
-
-        i -= 16;
-        src += 16;
-        dst += 16;
-    }
-
-    while (i) {
-        const __m128 floats = _mm_load_ss(src);
+    CONVERT_16_FWD({
+        const __m128 floats = _mm_load_ss(&src[i]);
         const __m128 values = _mm_mul_ss(floats, limit);
         const __m128i ints = _mm_xor_si128(_mm_cvttps_epi32(values), _mm_castps_si128(_mm_cmpge_ss(values, limit)));
-        *dst = (Sint32)_mm_cvtsi128_si32(ints);
+        dst[i] = (Sint32)_mm_cvtsi128_si32(ints);
+    }, {
+        const __m128 floats0 = _mm_loadu_ps(&src[i]);
+        const __m128 floats1 = _mm_loadu_ps(&src[i + 4]);
+        const __m128 floats2 = _mm_loadu_ps(&src[i + 8]);
+        const __m128 floats3 = _mm_loadu_ps(&src[i + 12]);
 
-        --i;
-        ++src;
-        ++dst;
-    }
+        const __m128 values1 = _mm_mul_ps(floats0, limit);
+        const __m128 values2 = _mm_mul_ps(floats1, limit);
+        const __m128 values3 = _mm_mul_ps(floats2, limit);
+        const __m128 values4 = _mm_mul_ps(floats3, limit);
+
+        const __m128i ints0 = _mm_xor_si128(_mm_cvttps_epi32(values1), _mm_castps_si128(_mm_cmpge_ps(values1, limit)));
+        const __m128i ints1 = _mm_xor_si128(_mm_cvttps_epi32(values2), _mm_castps_si128(_mm_cmpge_ps(values2, limit)));
+        const __m128i ints2 = _mm_xor_si128(_mm_cvttps_epi32(values3), _mm_castps_si128(_mm_cmpge_ps(values3, limit)));
+        const __m128i ints3 = _mm_xor_si128(_mm_cvttps_epi32(values4), _mm_castps_si128(_mm_cmpge_ps(values4, limit)));
+
+        _mm_store_si128((__m128i*)&dst[i], ints0);
+        _mm_store_si128((__m128i*)&dst[i + 4], ints1);
+        _mm_store_si128((__m128i*)&dst[i + 8], ints2);
+        _mm_store_si128((__m128i*)&dst[i + 12], ints3);
+    })
 }
 #endif
 
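Reviewer note: the overflow fixup above leans on an SSE2 quirk. _mm_cvttps_epi32 returns 0x80000000 ("integer indefinite") for anything it cannot represent, _mm_cmpge_ps produces an all-ones lane mask exactly for the too-large inputs, and XOR-ing that mask flips 0x80000000 into 0x7FFFFFFF. Scalar sketch of the same logic (not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    static int32_t f32_to_s32(float x)
    {
        const float v = x * 2147483648.0f;
        // emulate cvttps: out-of-range (and NaN) inputs become INT32_MIN
        int32_t i = (v >= 2147483648.0f || v < -2147483648.0f || v != v)
                        ? INT32_MIN
                        : (int32_t)v;  // in-range cast truncates toward zero
        if (v >= 2147483648.0f) {
            i = ~i;                    // 0x80000000 -> 0x7FFFFFFF
        }
        return i;
    }

    int main(void)
    {
        printf("%d\n", f32_to_s32(1.0f));   //  2147483647
        printf("%d\n", f32_to_s32(-1.0f));  // -2147483648
        printf("%d\n", f32_to_s32(0.5f));   //  1073741824
        return 0;
    }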
 #ifdef SDL_NEON_INTRINSICS
-#define DIVBY128 0.0078125f // 0x1p-7f
-#define DIVBY32768 0.000030517578125f // 0x1p-15f
-#define DIVBY8388607 0.00000011920930376163766f // 0x1.000002p-23f
-
 static void SDL_Convert_S8_to_F32_NEON(float *dst, const Sint8 *src, int num_samples)
 {
-    int i;
-
     LOG_DEBUG_AUDIO_CONVERT("S8", "F32 (using NEON)");
 
-    src += num_samples - 1;
-    dst += num_samples - 1;
+    CONVERT_16_REV({
+        vst1_lane_f32(&dst[i], vcvt_n_f32_s32(vdup_n_s32(src[i]), 7), 0);
+    }, {
+        int8x16_t bytes = vld1q_s8(&src[i]);
 
-    // Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src)
-    for (i = num_samples; i && (((size_t)(dst - 15)) & 15); --i, --src, --dst) {
-        *dst = ((float)*src) * DIVBY128;
-    }
+        int16x8_t shorts0 = vmovl_s8(vget_low_s8(bytes));
+        int16x8_t shorts1 = vmovl_s8(vget_high_s8(bytes));
 
-    src -= 15;
-    dst -= 15; // adjust to read NEON blocks from the start.
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        float32x4_t floats0 = vcvtq_n_f32_s32(vmovl_s16(vget_low_s16(shorts0)), 7);
+        float32x4_t floats1 = vcvtq_n_f32_s32(vmovl_s16(vget_high_s16(shorts0)), 7);
+        float32x4_t floats2 = vcvtq_n_f32_s32(vmovl_s16(vget_low_s16(shorts1)), 7);
+        float32x4_t floats3 = vcvtq_n_f32_s32(vmovl_s16(vget_high_s16(shorts1)), 7);
 
-    // Make sure src is aligned too.
-    if (!(((size_t)src) & 15)) {
-        // Aligned! Do NEON blocks as long as we have 16 bytes available.
-        const int8_t *mmsrc = (const int8_t *)src;
-        const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
-        while (i >= 16) { // 16 * 8-bit
-            const int8x16_t bytes = vld1q_s8(mmsrc); // get 16 sint8 into a NEON register.
-            const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes)); // convert top 8 bytes to 8 int16
-            const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes)); // convert bottom 8 bytes to 8 int16
-            // split int16 to two int32, then convert to float, then multiply to normalize, store.
-            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));
-            vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));
-            vst1q_f32(dst + 8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));
-            vst1q_f32(dst + 12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));
-            i -= 16;
-            mmsrc -= 16;
-            dst -= 16;
-        }
-
-        src = (const Sint8 *)mmsrc;
-    }
-
-    src += 15;
-    dst += 15; // adjust for any scalar finishing.
-
-    // Finish off any leftovers with scalar operations.
-    while (i) {
-        *dst = ((float)*src) * DIVBY128;
-        i--;
-        src--;
-        dst--;
-    }
+        vst1q_f32(&dst[i], floats0);
+        vst1q_f32(&dst[i + 4], floats1);
+        vst1q_f32(&dst[i + 8], floats2);
+        vst1q_f32(&dst[i + 12], floats3);
+    })
 }
 
 static void SDL_Convert_U8_to_F32_NEON(float *dst, const Uint8 *src, int num_samples)
 {
-    int i;
-
     LOG_DEBUG_AUDIO_CONVERT("U8", "F32 (using NEON)");
 
-    src += num_samples - 1;
-    dst += num_samples - 1;
+    uint8x16_t flipper = vdupq_n_u8(0x80);
 
-    // Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src)
-    for (i = num_samples; i && (((size_t)(dst - 15)) & 15); --i, --src, --dst) {
-        *dst = (((float)*src) * DIVBY128) - 1.0f;
-    }
+    CONVERT_16_REV({
+        vst1_lane_f32(&dst[i], vcvt_n_f32_s32(vdup_n_s32((Sint8)(src[i] ^ 0x80)), 7), 0);
+    }, {
+        int8x16_t bytes = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(&src[i]), flipper));
 
-    src -= 15;
-    dst -= 15; // adjust to read NEON blocks from the start.
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        int16x8_t shorts0 = vmovl_s8(vget_low_s8(bytes));
+        int16x8_t shorts1 = vmovl_s8(vget_high_s8(bytes));
 
-    // Make sure src is aligned too.
-    if (!(((size_t)src) & 15)) {
-        // Aligned! Do NEON blocks as long as we have 16 bytes available.
-        const uint8_t *mmsrc = (const uint8_t *)src;
-        const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
-        const float32x4_t negone = vdupq_n_f32(-1.0f);
-        while (i >= 16) { // 16 * 8-bit
-            const uint8x16_t bytes = vld1q_u8(mmsrc); // get 16 uint8 into a NEON register.
-            const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); // convert top 8 bytes to 8 uint16
-            const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); // convert bottom 8 bytes to 8 uint16
-            // split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store.
-            vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
-            vst1q_f32(dst + 4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
-            vst1q_f32(dst + 8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
-            vst1q_f32(dst + 12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
-            i -= 16;
-            mmsrc -= 16;
-            dst -= 16;
-        }
+        float32x4_t floats0 = vcvtq_n_f32_s32(vmovl_s16(vget_low_s16(shorts0)), 7);
+        float32x4_t floats1 = vcvtq_n_f32_s32(vmovl_s16(vget_high_s16(shorts0)), 7);
+        float32x4_t floats2 = vcvtq_n_f32_s32(vmovl_s16(vget_low_s16(shorts1)), 7);
+        float32x4_t floats3 = vcvtq_n_f32_s32(vmovl_s16(vget_high_s16(shorts1)), 7);
 
-        src = (const Uint8 *)mmsrc;
-    }
-
-    src += 15;
-    dst += 15; // adjust for any scalar finishing.
-
-    // Finish off any leftovers with scalar operations.
-    while (i) {
-        *dst = (((float)*src) * DIVBY128) - 1.0f;
-        i--;
-        src--;
-        dst--;
-    }
+        vst1q_f32(&dst[i], floats0);
+        vst1q_f32(&dst[i + 4], floats1);
+        vst1q_f32(&dst[i + 8], floats2);
+        vst1q_f32(&dst[i + 12], floats3);
+    })
 }
 
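Reviewer note: the DIVBY* constants can go away because NEON's fixed-point converts do the scaling inside the instruction: vcvtq_n_f32_s32(v, 7) treats each lane as a Q7 value, i.e. multiplies by 2^-7 = 1/128, which the old code did with a separate vmulq_f32. The backwards pointer walk and alignment asserts collapse into CONVERT_16_REV. Minimal sketch (not part of the patch; needs an ARM target with NEON):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        const int32_t samples[4] = { -128, -1, 1, 127 };
        float out[4];
        // 7 fractional bits: each lane becomes samples[i] / 128.0f exactly
        vst1q_f32(out, vcvtq_n_f32_s32(vld1q_s32(samples), 7));
        for (int i = 0; i < 4; ++i) {
            printf("%d -> %f (expected %f)\n", (int)samples[i], out[i], samples[i] / 128.0);
        }
        return 0;
    }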
 static void SDL_Convert_S16_to_F32_NEON(float *dst, const Sint16 *src, int num_samples)
 {
-    int i;
-
     LOG_DEBUG_AUDIO_CONVERT("S16", "F32 (using NEON)");
 
-    src += num_samples - 1;
-    dst += num_samples - 1;
+    CONVERT_16_REV({
+        vst1_lane_f32(&dst[i], vcvt_n_f32_s32(vdup_n_s32(src[i]), 15), 0);
+    }, {
+        int16x8_t shorts0 = vld1q_s16(&src[i]);
+        int16x8_t shorts1 = vld1q_s16(&src[i + 8]);
 
-    // Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src)
-    for (i = num_samples; i && (((size_t)(dst - 7)) & 15); --i, --src, --dst) {
-        *dst = ((float)*src) * DIVBY32768;
-    }
+        float32x4_t floats0 = vcvtq_n_f32_s32(vmovl_s16(vget_low_s16(shorts0)), 15);
+        float32x4_t floats1 = vcvtq_n_f32_s32(vmovl_s16(vget_high_s16(shorts0)), 15);
+        float32x4_t floats2 = vcvtq_n_f32_s32(vmovl_s16(vget_low_s16(shorts1)), 15);
+        float32x4_t floats3 = vcvtq_n_f32_s32(vmovl_s16(vget_high_s16(shorts1)), 15);
 
-    src -= 7;
-    dst -= 7; // adjust to read NEON blocks from the start.
-    SDL_assert(!i || !(((size_t)dst) & 15));
-
-    // Make sure src is aligned too.
-    if (!(((size_t)src) & 15)) {
-        // Aligned! Do NEON blocks as long as we have 16 bytes available.
-        const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
-        while (i >= 8) { // 8 * 16-bit
-            const int16x8_t ints = vld1q_s16((int16_t const *)src); // get 8 sint16 into a NEON register.
-            // split int16 to two int32, then convert to float, then multiply to normalize, store.
-            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
-            vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
-            i -= 8;
-            src -= 8;
-            dst -= 8;
-        }
-    }
-
-    src += 7;
-    dst += 7; // adjust for any scalar finishing.
-
-    // Finish off any leftovers with scalar operations.
-    while (i) {
-        *dst = ((float)*src) * DIVBY32768;
-        i--;
-        src--;
-        dst--;
-    }
+        vst1q_f32(&dst[i], floats0);
+        vst1q_f32(&dst[i + 4], floats1);
+        vst1q_f32(&dst[i + 8], floats2);
+        vst1q_f32(&dst[i + 12], floats3);
+    })
 }
 
 static void SDL_Convert_S32_to_F32_NEON(float *dst, const Sint32 *src, int num_samples)
 {
-    int i;
-
     LOG_DEBUG_AUDIO_CONVERT("S32", "F32 (using NEON)");
 
-    // Get dst aligned to 16 bytes
-    for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
-        *dst = ((float)(*src >> 8)) * DIVBY8388607;
-    }
+    CONVERT_16_FWD({
+        vst1_lane_f32(&dst[i], vcvt_n_f32_s32(vld1_dup_s32(&src[i]), 31), 0);
+    }, {
+        int32x4_t ints0 = vld1q_s32(&src[i]);
+        int32x4_t ints1 = vld1q_s32(&src[i + 4]);
+        int32x4_t ints2 = vld1q_s32(&src[i + 8]);
+        int32x4_t ints3 = vld1q_s32(&src[i + 12]);
 
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        float32x4_t floats0 = vcvtq_n_f32_s32(ints0, 31);
+        float32x4_t floats1 = vcvtq_n_f32_s32(ints1, 31);
+        float32x4_t floats2 = vcvtq_n_f32_s32(ints2, 31);
+        float32x4_t floats3 = vcvtq_n_f32_s32(ints3, 31);
 
-    // Make sure src is aligned too.
-    if (!(((size_t)src) & 15)) {
-        // Aligned! Do NEON blocks as long as we have 16 bytes available.
-        const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
-        const int32_t *mmsrc = (const int32_t *)src;
-        while (i >= 4) { // 4 * sint32
-            // shift out lowest bits so int fits in a float32. Small precision loss, but much faster.
-            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
-            i -= 4;
-            mmsrc += 4;
-            dst += 4;
-        }
-        src = (const Sint32 *)mmsrc;
-    }
-
-    // Finish off any leftovers with scalar operations.
-    while (i) {
-        *dst = ((float)(*src >> 8)) * DIVBY8388607;
-        i--;
-        src++;
-        dst++;
-    }
+        vst1q_f32(&dst[i], floats0);
+        vst1q_f32(&dst[i + 4], floats1);
+        vst1q_f32(&dst[i + 8], floats2);
+        vst1q_f32(&dst[i + 12], floats3);
+    })
 }
 
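Reviewer note: for S32 this is also a precision fix, not just a cleanup. The removed path shifted out the low 8 bits so the value would survive a 24-bit float conversion, then scaled by 1/8388607, which is not even a power of two; vcvt(q)_n_f32_s32 with 31 fractional bits converts the full 32-bit sample and scales by exactly 2^-31 in one instruction. Sketch of the difference on a small sample (not part of the patch; ARM target with NEON):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        const int32_t s = 0x80;  // tiny but nonzero sample
        float old_way, new_way;
        // removed path: >> 8 throws the sample away entirely
        vst1_lane_f32(&old_way,
            vmul_f32(vcvt_f32_s32(vshr_n_s32(vdup_n_s32(s), 8)),
                vdup_n_f32(0.00000011920930376163766f)), 0);
        // new path: full 32 bits, scaled by exactly 2^-31
        vst1_lane_f32(&new_way, vcvt_n_f32_s32(vdup_n_s32(s), 31), 0);
        printf("old=%g new=%g exact=%g\n", old_way, new_way, s / 2147483648.0);
        return 0;
    }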
 static void SDL_Convert_F32_to_S8_NEON(Sint8 *dst, const float *src, int num_samples)
 {
-    int i;
-
     LOG_DEBUG_AUDIO_CONVERT("F32", "S8 (using NEON)");
 
-    // Get dst aligned to 16 bytes
-    for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 127;
-        } else if (sample <= -1.0f) {
-            *dst = -128;
-        } else {
-            *dst = (Sint8)(sample * 127.0f);
-        }
-    }
+    CONVERT_16_FWD({
+        vst1_lane_s8(&dst[i], vreinterpret_s8_s32(vcvt_n_s32_f32(vld1_dup_f32(&src[i]), 31)), 3);
+    }, {
+        float32x4_t floats0 = vld1q_f32(&src[i]);
+        float32x4_t floats1 = vld1q_f32(&src[i + 4]);
+        float32x4_t floats2 = vld1q_f32(&src[i + 8]);
+        float32x4_t floats3 = vld1q_f32(&src[i + 12]);
 
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        int32x4_t ints0 = vcvtq_n_s32_f32(floats0, 31);
+        int32x4_t ints1 = vcvtq_n_s32_f32(floats1, 31);
+        int32x4_t ints2 = vcvtq_n_s32_f32(floats2, 31);
+        int32x4_t ints3 = vcvtq_n_s32_f32(floats3, 31);
 
-    // Make sure src is aligned too.
-    if (!(((size_t)src) & 15)) {
-        // Aligned! Do NEON blocks as long as we have 16 bytes available.
-        const float32x4_t one = vdupq_n_f32(1.0f);
-        const float32x4_t negone = vdupq_n_f32(-1.0f);
-        const float32x4_t mulby127 = vdupq_n_f32(127.0f);
-        int8_t *mmdst = (int8_t *)dst;
-        while (i >= 16) { // 16 * float32
-            const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127)); // load 4 floats, clamp, convert to sint32
-            const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 4)), one), mulby127)); // load 4 floats, clamp, convert to sint32
-            const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 8)), one), mulby127)); // load 4 floats, clamp, convert to sint32
-            const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 12)), one), mulby127)); // load 4 floats, clamp, convert to sint32
-            const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); // narrow to sint16, combine, narrow to sint8
-            const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); // narrow to sint16, combine, narrow to sint8
-            vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi)); // combine to int8x16_t, store out
-            i -= 16;
-            src += 16;
-            mmdst += 16;
-        }
-        dst = (Sint8 *)mmdst;
-    }
+        int16x8_t shorts0 = vcombine_s16(vshrn_n_s32(ints0, 16), vshrn_n_s32(ints1, 16));
+        int16x8_t shorts1 = vcombine_s16(vshrn_n_s32(ints2, 16), vshrn_n_s32(ints3, 16));
 
-    // Finish off any leftovers with scalar operations.
-    while (i) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 127;
-        } else if (sample <= -1.0f) {
-            *dst = -128;
-        } else {
-            *dst = (Sint8)(sample * 127.0f);
-        }
-        i--;
-        src++;
-        dst++;
-    }
+        int8x16_t bytes = vcombine_s8(vshrn_n_s16(shorts0, 8), vshrn_n_s16(shorts1, 8));
+
+        vst1q_s8(&dst[i], bytes);
+    })
 }
 
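Reviewer note: vcvtq_n_s32_f32(x, 31) is a saturating convert to Q31 (x * 2^31, clamped to the Sint32 range), and each vshrn narrowing shift keeps the upper half, so the byte that survives is the Q31 value shifted down by 24: a scale-by-128 with proper saturation. This also brings the NEON path in line with the SSE2 full-scale mapping (clamp to [-128, 127]), where the removed code scaled by 127. Scalar equivalent, as a sketch (not part of the patch; assumes an arithmetic right shift for negative values, which all the targets here provide):

    #include <stdint.h>
    #include <stdio.h>

    static int8_t f32_to_s8_neon_equiv(float x)
    {
        const float v = x * 2147483648.0f;
        const int32_t q31 = v >= 2147483648.0f ? INT32_MAX    // saturate high
                          : v < -2147483648.0f ? INT32_MIN    // saturate low
                          : (int32_t)v;                       // rounds toward zero
        return (int8_t)(q31 >> 24);                           // top 8 bits = Q7
    }

    int main(void)
    {
        printf("%d %d %d\n",
               f32_to_s8_neon_equiv(1.0f),    //  127 (saturated)
               f32_to_s8_neon_equiv(-1.0f),   // -128
               f32_to_s8_neon_equiv(0.5f));   //   64
        return 0;
    }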
 static void SDL_Convert_F32_to_U8_NEON(Uint8 *dst, const float *src, int num_samples)
 {
-    int i;
-
     LOG_DEBUG_AUDIO_CONVERT("F32", "U8 (using NEON)");
 
-    // Get dst aligned to 16 bytes
-    for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 255;
-        } else if (sample <= -1.0f) {
-            *dst = 0;
-        } else {
-            *dst = (Uint8)((sample + 1.0f) * 127.0f);
-        }
-    }
+    uint8x16_t flipper = vdupq_n_u8(0x80);
 
-    SDL_assert(!i || !(((size_t)dst) & 15));
+    CONVERT_16_FWD({
+        vst1_lane_u8(&dst[i],
+            veor_u8(vreinterpret_u8_s32(vcvt_n_s32_f32(vld1_dup_f32(&src[i]), 31)),
+                vget_low_u8(flipper)), 3);
+    }, {
+        float32x4_t floats0 = vld1q_f32(&src[i]);
+        float32x4_t floats1 = vld1q_f32(&src[i + 4]);
+        float32x4_t floats2 = vld1q_f32(&src[i + 8]);
+        float32x4_t floats3 = vld1q_f32(&src[i + 12]);
 
-    // Make sure src is aligned too.
-    if (!(((size_t)src) & 15)) {
-        // Aligned! Do NEON blocks as long as we have 16 bytes available.
-        const float32x4_t one = vdupq_n_f32(1.0f);
-        const float32x4_t negone = vdupq_n_f32(-1.0f);
-        const float32x4_t mulby127 = vdupq_n_f32(127.0f);
-        uint8_t *mmdst = (uint8_t *)dst;
-        while (i >= 16) { // 16 * float32
-            const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127)); // load 4 floats, clamp, convert to uint32
-            const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 4)), one), one), mulby127)); // load 4 floats, clamp, convert to uint32
-            const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 8)), one), one), mulby127)); // load 4 floats, clamp, convert to uint32
-            const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 12)), one), one), mulby127)); // load 4 floats, clamp, convert to uint32
-            const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); // narrow to uint16, combine, narrow to uint8
-            const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); // narrow to uint16, combine, narrow to uint8
-            vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi)); // combine to uint8x16_t, store out
-            i -= 16;
-            src += 16;
-            mmdst += 16;
-        }
+        int32x4_t ints0 = vcvtq_n_s32_f32(floats0, 31);
+        int32x4_t ints1 = vcvtq_n_s32_f32(floats1, 31);
+        int32x4_t ints2 = vcvtq_n_s32_f32(floats2, 31);
+        int32x4_t ints3 = vcvtq_n_s32_f32(floats3, 31);
 
-        dst = (Uint8 *)mmdst;
-    }
+        int16x8_t shorts0 = vcombine_s16(vshrn_n_s32(ints0, 16), vshrn_n_s32(ints1, 16));
+        int16x8_t shorts1 = vcombine_s16(vshrn_n_s32(ints2, 16), vshrn_n_s32(ints3, 16));
 
-    // Finish off any leftovers with scalar operations.
-    while (i) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 255;
-        } else if (sample <= -1.0f) {
-            *dst = 0;
-        } else {
-            *dst = (Uint8)((sample + 1.0f) * 127.0f);
-        }
-        i--;
-        src++;
-        dst++;
-    }
+        uint8x16_t bytes = veorq_u8(vreinterpretq_u8_s8(
+            vcombine_s8(vshrn_n_s16(shorts0, 8), vshrn_n_s16(shorts1, 8))),
+            flipper);
+
+        vst1q_u8(&dst[i], bytes);
+    })
 }
 
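Reviewer note: the unsigned variant reuses the signed Q31 pipeline and then flips the top bit of each byte. In two's complement, XOR-ing an S8 value with 0x80 is the same as adding the 128 bias, so -128..127 maps to 0..255 with no extra arithmetic; the scalar tail does the same thing with vget_low_u8(flipper), pulling lane 3 (the high byte of the little-endian Q31 lane). Standalone check (not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        for (int s = -128; s <= 127; ++s) {
            uint8_t u = (uint8_t)((int8_t)s) ^ 0x80u;  // same op as veorq_u8 with 0x80
            if (u != (unsigned)(s + 128)) {
                printf("mismatch at %d\n", s);
                return 1;
            }
        }
        puts("XOR 0x80 == +128 bias for every S8 value");
        return 0;
    }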
 static void SDL_Convert_F32_to_S16_NEON(Sint16 *dst, const float *src, int num_samples)
 {
-    int i;
-
     LOG_DEBUG_AUDIO_CONVERT("F32", "S16 (using NEON)");
 
-    // Get dst aligned to 16 bytes
-    for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 32767;
-        } else if (sample <= -1.0f) {
-            *dst = -32768;
-        } else {
-            *dst = (Sint16)(sample * 32767.0f);
-        }
-    }
+    CONVERT_16_FWD({
+        vst1_lane_s16(&dst[i], vreinterpret_s16_s32(vcvt_n_s32_f32(vld1_dup_f32(&src[i]), 31)), 1);
+    }, {
+        float32x4_t floats0 = vld1q_f32(&src[i]);
+        float32x4_t floats1 = vld1q_f32(&src[i + 4]);
+        float32x4_t floats2 = vld1q_f32(&src[i + 8]);
+        float32x4_t floats3 = vld1q_f32(&src[i + 12]);
 
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        int32x4_t ints0 = vcvtq_n_s32_f32(floats0, 31);
+        int32x4_t ints1 = vcvtq_n_s32_f32(floats1, 31);
+        int32x4_t ints2 = vcvtq_n_s32_f32(floats2, 31);
+        int32x4_t ints3 = vcvtq_n_s32_f32(floats3, 31);
 
-    // Make sure src is aligned too.
-    if (!(((size_t)src) & 15)) {
-        // Aligned! Do NEON blocks as long as we have 16 bytes available.
-        const float32x4_t one = vdupq_n_f32(1.0f);
-        const float32x4_t negone = vdupq_n_f32(-1.0f);
-        const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
-        int16_t *mmdst = (int16_t *)dst;
-        while (i >= 8) { // 8 * float32
-            const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767)); // load 4 floats, clamp, convert to sint32
-            const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src + 4)), one), mulby32767)); // load 4 floats, clamp, convert to sint32
-            vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); // narrow to sint16, combine, store out.
-            i -= 8;
-            src += 8;
-            mmdst += 8;
-        }
-        dst = (Sint16 *)mmdst;
-    }
+        int16x8_t shorts0 = vcombine_s16(vshrn_n_s32(ints0, 16), vshrn_n_s32(ints1, 16));
+        int16x8_t shorts1 = vcombine_s16(vshrn_n_s32(ints2, 16), vshrn_n_s32(ints3, 16));
 
-    // Finish off any leftovers with scalar operations.
-    while (i) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 32767;
-        } else if (sample <= -1.0f) {
-            *dst = -32768;
-        } else {
-            *dst = (Sint16)(sample * 32767.0f);
-        }
-        i--;
-        src++;
-        dst++;
-    }
+        vst1q_s16(&dst[i], shorts0);
+        vst1q_s16(&dst[i + 8], shorts1);
+    })
 }
 
 static void SDL_Convert_F32_to_S32_NEON(Sint32 *dst, const float *src, int num_samples)
 {
-    int i;
-
     LOG_DEBUG_AUDIO_CONVERT("F32", "S32 (using NEON)");
 
-    // Get dst aligned to 16 bytes
-    for (i = num_samples; i && (((size_t)dst) & 15); --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 2147483647;
-        } else if (sample <= -1.0f) {
-            *dst = (-2147483647) - 1;
-        } else {
-            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
-        }
-    }
+    CONVERT_16_FWD({
+        vst1_lane_s32(&dst[i], vcvt_n_s32_f32(vld1_dup_f32(&src[i]), 31), 0);
+    }, {
+        float32x4_t floats0 = vld1q_f32(&src[i]);
+        float32x4_t floats1 = vld1q_f32(&src[i + 4]);
+        float32x4_t floats2 = vld1q_f32(&src[i + 8]);
+        float32x4_t floats3 = vld1q_f32(&src[i + 12]);
 
-    SDL_assert(!i || !(((size_t)dst) & 15));
-    SDL_assert(!i || !(((size_t)src) & 15));
+        int32x4_t ints0 = vcvtq_n_s32_f32(floats0, 31);
+        int32x4_t ints1 = vcvtq_n_s32_f32(floats1, 31);
+        int32x4_t ints2 = vcvtq_n_s32_f32(floats2, 31);
+        int32x4_t ints3 = vcvtq_n_s32_f32(floats3, 31);
 
-    {
-        // Aligned! Do NEON blocks as long as we have 16 bytes available.
-        const float32x4_t one = vdupq_n_f32(1.0f);
-        const float32x4_t negone = vdupq_n_f32(-1.0f);
-        const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
-        int32_t *mmdst = (int32_t *)dst;
-        while (i >= 4) { // 4 * float32
-            vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
-            i -= 4;
-            src += 4;
-            mmdst += 4;
-        }
-        dst = (Sint32 *)mmdst;
-    }
-
-    // Finish off any leftovers with scalar operations.
-    while (i) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 2147483647;
-        } else if (sample <= -1.0f) {
-            *dst = (-2147483647) - 1;
-        } else {
-            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
-        }
-        i--;
-        src++;
-        dst++;
-    }
+        vst1q_s32(&dst[i], ints0);
+        vst1q_s32(&dst[i + 4], ints1);
+        vst1q_s32(&dst[i + 8], ints2);
+        vst1q_s32(&dst[i + 12], ints3);
+    })
 }
 #endif
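Reviewer note: since every converter now has three code paths (alignment head, 16-sample blocks, tail), a differential harness that sweeps buffer lengths and destination misalignment is the natural way to validate the patch. Sketch below; convert_fn, reference_fn, and check are placeholders, not SDL symbols, and the bit-exact memcmp assumes a conversion that is exact in both paths, as S16 -> F32 is:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef void (*convert_fn)(float *dst, const short *src, int num_samples);

    // scalar S16 -> F32 reference; exact, so SIMD output must match bit-for-bit
    static void reference_fn(float *dst, const short *src, int num_samples)
    {
        for (int i = 0; i < num_samples; ++i) {
            dst[i] = (float)src[i] / 32768.0f;
        }
    }

    static int check(convert_fn convert_under_test)
    {
        static short src[64];
        static float got[64 + 4], want[64 + 4];

        srand(12345);
        for (int i = 0; i < 64; ++i) {
            src[i] = (short)((rand() & 0xFFFF) - 32768);
        }
        for (int len = 0; len <= 64; ++len) {          // below, at, and above one block
            for (int shift = 0; shift < 4; ++shift) {  // varying dst misalignment
                memset(got, 0, sizeof(got));
                memset(want, 0, sizeof(want));
                convert_under_test(got + shift, src, len);
                reference_fn(want + shift, src, len);
                if (memcmp(got, want, sizeof(got)) != 0) {
                    printf("mismatch at len=%d shift=%d\n", len, shift);
                    return 0;
                }
            }
        }
        return 1;
    }

    int main(void)
    {
        // self-test of the harness; a real run would pass the SIMD converter
        return check(reference_fn) ? 0 : 1;
    }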