diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c
index 57e33e6f5..e51a7daba 100644
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -132,59 +132,6 @@ static void SDL_TARGETING("sse") SDL_ConvertMonoToStereo_SSE(float *dst, const f
 // Include the autogenerated channel converters...
 #include "SDL_audio_channel_converters.h"
 
-
-static void AudioConvertByteswap(void *dst, const void *src, int num_samples, int bitsize)
-{
-#if DEBUG_AUDIO_CONVERT
-    SDL_Log("SDL_AUDIO_CONVERT: Converting %d-bit byte order", bitsize);
-#endif
-
-    switch (bitsize) {
-#define CASESWAP(b) \
-    case b: { \
-        const Uint##b *tsrc = (const Uint##b *)src; \
-        Uint##b *tdst = (Uint##b *)dst; \
-        for (int i = 0; i < num_samples; i++) { \
-            tdst[i] = SDL_Swap##b(tsrc[i]); \
-        } \
-        break; \
-    }
-
-        CASESWAP(16);
-        CASESWAP(32);
-
-#undef CASESWAP
-
-    default:
-        SDL_assert(!"unhandled byteswap datatype!");
-        break;
-    }
-}
-
-static void AudioConvertToFloat(float *dst, const void *src, int num_samples, SDL_AudioFormat src_fmt)
-{
-    // Endian conversion is handled separately
-    switch (src_fmt & ~SDL_AUDIO_MASK_BIG_ENDIAN) {
-        case SDL_AUDIO_S8: SDL_Convert_S8_to_F32(dst, (const Sint8 *) src, num_samples); break;
-        case SDL_AUDIO_U8: SDL_Convert_U8_to_F32(dst, (const Uint8 *) src, num_samples); break;
-        case SDL_AUDIO_S16LE: SDL_Convert_S16_to_F32(dst, (const Sint16 *) src, num_samples); break;
-        case SDL_AUDIO_S32LE: SDL_Convert_S32_to_F32(dst, (const Sint32 *) src, num_samples); break;
-        default: SDL_assert(!"Unexpected audio format!"); break;
-    }
-}
-
-static void AudioConvertFromFloat(void *dst, const float *src, int num_samples, SDL_AudioFormat dst_fmt)
-{
-    // Endian conversion is handled separately
-    switch (dst_fmt & ~SDL_AUDIO_MASK_BIG_ENDIAN) {
-        case SDL_AUDIO_S8: SDL_Convert_F32_to_S8((Sint8 *) dst, src, num_samples); break;
-        case SDL_AUDIO_U8: SDL_Convert_F32_to_U8((Uint8 *) dst, src, num_samples); break;
-        case SDL_AUDIO_S16LE: SDL_Convert_F32_to_S16((Sint16 *) dst, src, num_samples); break;
-        case SDL_AUDIO_S32LE: SDL_Convert_F32_to_S32((Sint32 *) dst, src, num_samples); break;
-        default: SDL_assert(!"Unexpected audio format!"); break;
-    }
-}
-
 static SDL_bool SDL_IsSupportedAudioFormat(const SDL_AudioFormat fmt)
 {
     switch (fmt) {
@@ -210,7 +157,6 @@ static SDL_bool SDL_IsSupportedChannelCount(const int channels)
     return ((channels >= 1) && (channels <= 8));
 }
 
-
 // This does type and channel conversions _but not resampling_ (resampling happens in SDL_AudioStream).
 // This does not check parameter validity, (beyond asserts), it expects you did that already!
 // All of this has to function as if src==dst==scratch (conversion in-place), but as a convenience
@@ -266,14 +212,8 @@ void ConvertAudio(int num_frames, const void *src, SDL_AudioFormat src_format, i
         }
 
         // just a byteswap needed?
-        if ((src_format & ~SDL_AUDIO_MASK_BIG_ENDIAN) == (dst_format & ~SDL_AUDIO_MASK_BIG_ENDIAN)) {
-            if (src_bitsize == 8) {
-                if (src != dst) {
-                    SDL_memcpy(dst, src, num_frames * dst_sample_frame_size);
-                }
-                return;  // nothing to do, it's a 1-byte format.
-            }
-            AudioConvertByteswap(dst, src, num_frames * src_channels, src_bitsize);
+        if ((src_format ^ dst_format) == SDL_AUDIO_MASK_BIG_ENDIAN) {
+            ConvertAudioSwapEndian(dst, src, num_frames * src_channels, src_bitsize);
             return;  // all done.
         }
     }
@@ -282,23 +222,14 @@ void ConvertAudio(int num_frames, const void *src, SDL_AudioFormat src_format, i
         scratch = dst;
     }
 
-    const SDL_bool srcbyteswap = (SDL_AUDIO_ISBIGENDIAN(src_format) != 0) == (SDL_BYTEORDER == SDL_LIL_ENDIAN) && (src_bitsize > 8);
-    const SDL_bool srcconvert = !SDL_AUDIO_ISFLOAT(src_format);
+    const SDL_bool srcconvert = src_format != SDL_AUDIO_F32;
     const SDL_bool channelconvert = src_channels != dst_channels;
-    const SDL_bool dstconvert = !SDL_AUDIO_ISFLOAT(dst_format);
-    const SDL_bool dstbyteswap = (SDL_AUDIO_ISBIGENDIAN(dst_format) != 0) == (SDL_BYTEORDER == SDL_LIL_ENDIAN) && (dst_bitsize > 8);
-
-    // make sure we're in native byte order.
-    if (srcbyteswap) {
-        // No point writing straight to dst. If we only need a byteswap, we wouldn't be bere.
-        AudioConvertByteswap(scratch, src, num_frames * src_channels, src_bitsize);
-        src = scratch;
-    }
+    const SDL_bool dstconvert = dst_format != SDL_AUDIO_F32;
 
     // get us to float format.
     if (srcconvert) {
-        void* buf = (channelconvert || dstconvert || dstbyteswap) ? scratch : dst;
-        AudioConvertToFloat((float *) buf, src, num_frames * src_channels, src_format);
+        void* buf = (channelconvert || dstconvert) ? scratch : dst;
+        ConvertAudioToFloat((float *) buf, src, num_frames * src_channels, src_format);
         src = buf;
     }
 
@@ -330,7 +261,7 @@ void ConvertAudio(int num_frames, const void *src, SDL_AudioFormat src_format, i
             channel_converter = override;
         }
 
-        void* buf = (dstconvert || dstbyteswap) ? scratch : dst;
+        void* buf = dstconvert ? scratch : dst;
         channel_converter((float *) buf, (const float *) src, num_frames);
         src = buf;
     }
@@ -339,16 +270,10 @@ void ConvertAudio(int num_frames, const void *src, SDL_AudioFormat src_format, i
 
     // Move to final data type.
     if (dstconvert) {
-        AudioConvertFromFloat(dst, (const float *) src, num_frames * dst_channels, dst_format);
+        ConvertAudioFromFloat(dst, (const float *) src, num_frames * dst_channels, dst_format);
         src = dst;
     }
 
-    // make sure we're in final byte order.
-    if (dstbyteswap) {
-        AudioConvertByteswap(dst, src, num_frames * dst_channels, dst_bitsize);
-        src = dst;  // we've written to dst, future work will convert in-place.
-    }
-
     SDL_assert(src == dst);  // if we got here, we _had_ to have done _something_. Otherwise, we should have memcpy'd!
 }
 
diff --git a/src/audio/SDL_audiotypecvt.c b/src/audio/SDL_audiotypecvt.c
index 23ad9d443..0478d86d7 100644
--- a/src/audio/SDL_audiotypecvt.c
+++ b/src/audio/SDL_audiotypecvt.c
@@ -181,6 +181,24 @@ static void SDL_Convert_F32_to_S32_Scalar(Sint32 *dst, const float *src, int num
 
 #undef SIGNMASK
 
+// Scalar fallback: writes the byteswapped form of each 16-bit sample in src to dst.
+static void SDL_Convert_Swap16_Scalar(Uint16* dst, const Uint16* src, int num_samples)
+{
+    // Each element is read before the same index is written, so dst == src works.
+    for (int n = 0; n < num_samples; ++n) {
+        dst[n] = SDL_Swap16(src[n]);
+    }
+}
+
+// Scalar fallback: writes the byteswapped form of each 32-bit sample in src to dst.
+static void SDL_Convert_Swap32_Scalar(Uint32* dst, const Uint32* src, int num_samples)
+{
+    // Each element is read before the same index is written, so dst == src works.
+    for (int n = 0; n < num_samples; ++n) {
+        dst[n] = SDL_Swap32(src[n]);
+    }
+}
+
 // end fallback scalar converters
 
 // Convert forwards, when sizeof(*src) >= sizeof(*dst)
@@ -463,6 +481,51 @@ static void SDL_TARGETING("sse2") SDL_Convert_F32_to_S32_SSE2(Sint32 *dst, const
 }
 #endif
 
+// FIXME: SDL doesn't have SSSE3 detection, so use the next one up
+#ifdef SDL_SSE4_1_INTRINSICS
+static void SDL_TARGETING("ssse3") SDL_Convert_Swap16_SSSE3(Uint16* dst, const Uint16* src, int num_samples)
+{
+    const __m128i shuffle = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    // Byteswaps 16-bit samples with a byte shuffle. The control above is listed high byte first: output byte n = input byte n^1.
+    CONVERT_16_FWD({
+        dst[i] = SDL_Swap16(src[i]);
+    }, {
+        __m128i ints0 = _mm_loadu_si128((const __m128i*)&src[i]);
+        __m128i ints1 = _mm_loadu_si128((const __m128i*)&src[i + 8]);
+        // Two 128-bit registers = 16 samples per vector step; the loads above tolerate an unaligned src.
+        ints0 = _mm_shuffle_epi8(ints0, shuffle);
+        ints1 = _mm_shuffle_epi8(ints1, shuffle);
+        // NOTE(review): aligned stores; presumably CONVERT_16_FWD (defined outside this view) 16-byte-aligns dst before this step -- confirm.
+        _mm_store_si128((__m128i*)&dst[i], ints0);
+        _mm_store_si128((__m128i*)&dst[i + 8], ints1);
+    })
+}
+
+static void SDL_TARGETING("ssse3") SDL_Convert_Swap32_SSSE3(Uint32* dst, const Uint32* src, int num_samples)
+{
+    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
+    // Byteswaps 32-bit samples with a byte shuffle. The control above is listed high byte first: output byte n = input byte n^3.
+    CONVERT_16_FWD({
+        dst[i] = SDL_Swap32(src[i]);
+    }, {
+        __m128i ints0 = _mm_loadu_si128((const __m128i*)&src[i]);
+        __m128i ints1 = _mm_loadu_si128((const __m128i*)&src[i + 4]);
+        __m128i ints2 = _mm_loadu_si128((const __m128i*)&src[i + 8]);
+        __m128i ints3 = _mm_loadu_si128((const __m128i*)&src[i + 12]);
+        // Four 128-bit registers = 16 samples per vector step; the loads above tolerate an unaligned src.
+        ints0 = _mm_shuffle_epi8(ints0, shuffle);
+        ints1 = _mm_shuffle_epi8(ints1, shuffle);
+        ints2 = _mm_shuffle_epi8(ints2, shuffle);
+        ints3 = _mm_shuffle_epi8(ints3, shuffle);
+        // NOTE(review): aligned stores; presumably CONVERT_16_FWD (defined outside this view) 16-byte-aligns dst before this step -- confirm.
+        _mm_store_si128((__m128i*)&dst[i], ints0);
+        _mm_store_si128((__m128i*)&dst[i + 4], ints1);
+        _mm_store_si128((__m128i*)&dst[i + 8], ints2);
+        _mm_store_si128((__m128i*)&dst[i + 12], ints3);
+    })
+}
+#endif
+
 #ifdef SDL_NEON_INTRINSICS
 static void SDL_Convert_S8_to_F32_NEON(float *dst, const Sint8 *src, int num_samples)
 {
@@ -666,17 +729,144 @@ static void SDL_Convert_F32_to_S32_NEON(Sint32 *dst, const float *src, int num_s
         vst1q_s32(&dst[i + 12], ints3);
     })
 }
+
+static void SDL_Convert_Swap16_NEON(Uint16* dst, const Uint16* src, int num_samples)
+{
+    CONVERT_16_FWD({
+        dst[i] = SDL_Swap16(src[i]);
+    }, {
+        uint8x16_t ints0 = vld1q_u8((const Uint8*)&src[i]);
+        uint8x16_t ints1 = vld1q_u8((const Uint8*)&src[i + 8]);
+        // NEON byteswap of 16-bit samples: vrev16q_u8 reverses the bytes inside every 16-bit element.
+        ints0 = vrev16q_u8(ints0);
+        ints1 = vrev16q_u8(ints1);
+        // The data is handled as raw bytes on both sides; two registers = 16 samples per vector step.
+        vst1q_u8((Uint8*)&dst[i], ints0);
+        vst1q_u8((Uint8*)&dst[i + 8], ints1);
+    })
+}
+
+static void SDL_Convert_Swap32_NEON(Uint32* dst, const Uint32* src, int num_samples)
+{
+    CONVERT_16_FWD({
+        dst[i] = SDL_Swap32(src[i]);
+    }, {
+        uint8x16_t ints0 = vld1q_u8((const Uint8*)&src[i]);
+        uint8x16_t ints1 = vld1q_u8((const Uint8*)&src[i + 4]);
+        uint8x16_t ints2 = vld1q_u8((const Uint8*)&src[i + 8]);
+        uint8x16_t ints3 = vld1q_u8((const Uint8*)&src[i + 12]);
+        // NEON byteswap of 32-bit samples: vrev32q_u8 reverses the bytes inside every 32-bit element.
+        ints0 = vrev32q_u8(ints0);
+        ints1 = vrev32q_u8(ints1);
+        ints2 = vrev32q_u8(ints2);
+        ints3 = vrev32q_u8(ints3);
+        // The data is handled as raw bytes on both sides; four registers = 16 samples per vector step.
+        vst1q_u8((Uint8*)&dst[i], ints0);
+        vst1q_u8((Uint8*)&dst[i + 4], ints1);
+        vst1q_u8((Uint8*)&dst[i + 8], ints2);
+        vst1q_u8((Uint8*)&dst[i + 12], ints3);
+    })
+}
 #endif
 
+#undef CONVERT_16_FWD
+#undef CONVERT_16_REV
+
 // Function pointers set to a CPU-specific implementation.
-void (*SDL_Convert_S8_to_F32)(float *dst, const Sint8 *src, int num_samples) = NULL;
-void (*SDL_Convert_U8_to_F32)(float *dst, const Uint8 *src, int num_samples) = NULL;
-void (*SDL_Convert_S16_to_F32)(float *dst, const Sint16 *src, int num_samples) = NULL;
-void (*SDL_Convert_S32_to_F32)(float *dst, const Sint32 *src, int num_samples) = NULL;
-void (*SDL_Convert_F32_to_S8)(Sint8 *dst, const float *src, int num_samples) = NULL;
-void (*SDL_Convert_F32_to_U8)(Uint8 *dst, const float *src, int num_samples) = NULL;
-void (*SDL_Convert_F32_to_S16)(Sint16 *dst, const float *src, int num_samples) = NULL;
-void (*SDL_Convert_F32_to_S32)(Sint32 *dst, const float *src, int num_samples) = NULL;
+static void (*SDL_Convert_S8_to_F32)(float *dst, const Sint8 *src, int num_samples) = NULL;
+static void (*SDL_Convert_U8_to_F32)(float *dst, const Uint8 *src, int num_samples) = NULL;
+static void (*SDL_Convert_S16_to_F32)(float *dst, const Sint16 *src, int num_samples) = NULL;
+static void (*SDL_Convert_S32_to_F32)(float *dst, const Sint32 *src, int num_samples) = NULL;
+static void (*SDL_Convert_F32_to_S8)(Sint8 *dst, const float *src, int num_samples) = NULL;
+static void (*SDL_Convert_F32_to_U8)(Uint8 *dst, const float *src, int num_samples) = NULL;
+static void (*SDL_Convert_F32_to_S16)(Sint16 *dst, const float *src, int num_samples) = NULL;
+static void (*SDL_Convert_F32_to_S32)(Sint32 *dst, const float *src, int num_samples) = NULL;
+// Byteswap helpers, also selected per CPU: SDL_ChooseAudioConverters() assigns the Scalar, SSSE3 or NEON variant.
+static void (*SDL_Convert_Swap16)(Uint16* dst, const Uint16* src, int num_samples) = NULL;
+static void (*SDL_Convert_Swap32)(Uint32* dst, const Uint32* src, int num_samples) = NULL;
+
+// Converts num_samples samples of src_fmt at src into floats at dst. A format whose big-endian bit is flipped
+// (the "^ SDL_AUDIO_MASK_BIG_ENDIAN" cases) is byteswapped into dst first. SDL_AUDIO_F32 itself has no case here.
+void ConvertAudioToFloat(float *dst, const void *src, int num_samples, SDL_AudioFormat src_fmt)
+{
+    switch (src_fmt) {
+    case SDL_AUDIO_S8:
+        SDL_Convert_S8_to_F32(dst, (const Sint8 *) src, num_samples);
+        return;
+    case SDL_AUDIO_U8:
+        SDL_Convert_U8_to_F32(dst, (const Uint8 *) src, num_samples);
+        return;
+    case SDL_AUDIO_S16:
+        SDL_Convert_S16_to_F32(dst, (const Sint16 *) src, num_samples);
+        return;
+    case SDL_AUDIO_S16 ^ SDL_AUDIO_MASK_BIG_ENDIAN:
+        // Two steps: byteswap into dst, then convert to float reading back from dst.
+        SDL_Convert_Swap16((Uint16*) dst, (const Uint16*) src, num_samples);
+        SDL_Convert_S16_to_F32(dst, (const Sint16 *) dst, num_samples);
+        return;
+    case SDL_AUDIO_S32:
+        SDL_Convert_S32_to_F32(dst, (const Sint32 *) src, num_samples);
+        return;
+    case SDL_AUDIO_S32 ^ SDL_AUDIO_MASK_BIG_ENDIAN:
+        // Same two-step path for 32-bit integers; the conversion runs in place on dst.
+        SDL_Convert_Swap32((Uint32*) dst, (const Uint32*) src, num_samples);
+        SDL_Convert_S32_to_F32(dst, (const Sint32 *) dst, num_samples);
+        return;
+    case SDL_AUDIO_F32 ^ SDL_AUDIO_MASK_BIG_ENDIAN:
+        // Float data that only differs in byte order: the swap is the whole conversion.
+        SDL_Convert_Swap32((Uint32*) dst, (const Uint32*) src, num_samples);
+        return;
+    default:
+        SDL_assert(!"Unexpected audio format!");
+        return;
+    }
+}
+
+// Converts num_samples floats at src into dst_fmt samples at dst. For a format whose big-endian bit is flipped
+// (the "^ SDL_AUDIO_MASK_BIG_ENDIAN" cases) the converted samples are then byteswapped. SDL_AUDIO_F32 has no case.
+void ConvertAudioFromFloat(void *dst, const float *src, int num_samples, SDL_AudioFormat dst_fmt)
+{
+    switch (dst_fmt) {
+    case SDL_AUDIO_S8:
+        SDL_Convert_F32_to_S8((Sint8 *) dst, src, num_samples);
+        return;
+    case SDL_AUDIO_U8:
+        SDL_Convert_F32_to_U8((Uint8 *) dst, src, num_samples);
+        return;
+    case SDL_AUDIO_S16:
+        SDL_Convert_F32_to_S16((Sint16 *) dst, src, num_samples);
+        return;
+    case SDL_AUDIO_S16 ^ SDL_AUDIO_MASK_BIG_ENDIAN:
+        // Two steps: convert into dst, then byteswap dst in place.
+        SDL_Convert_F32_to_S16((Sint16 *) dst, src, num_samples);
+        SDL_Convert_Swap16((Uint16*) dst, (const Uint16*) dst, num_samples);
+        return;
+    case SDL_AUDIO_S32:
+        SDL_Convert_F32_to_S32((Sint32 *) dst, src, num_samples);
+        return;
+    case SDL_AUDIO_S32 ^ SDL_AUDIO_MASK_BIG_ENDIAN:
+        // Same two-step path for 32-bit integers.
+        SDL_Convert_F32_to_S32((Sint32 *) dst, src, num_samples);
+        SDL_Convert_Swap32((Uint32*) dst, (const Uint32*) dst, num_samples);
+        return;
+    case SDL_AUDIO_F32 ^ SDL_AUDIO_MASK_BIG_ENDIAN:
+        // Float output that only differs in byte order: the swap is the whole conversion.
+        SDL_Convert_Swap32((Uint32*) dst, (const Uint32*) src, num_samples);
+        return;
+    default:
+        SDL_assert(!"Unexpected audio format!");
+        return;
+    }
+}
+
+// Byteswaps num_samples samples from src into dst. bitsize is the width of one
+// sample in bits; only 16 and 32 are handled, anything else trips the assert.
+void ConvertAudioSwapEndian(void* dst, const void* src, int num_samples, int bitsize)
+{
+    if (bitsize == 16) { SDL_Convert_Swap16((Uint16*) dst, (const Uint16*) src, num_samples); return; }
+    if (bitsize == 32) { SDL_Convert_Swap32((Uint32*) dst, (const Uint32*) src, num_samples); return; }
+    SDL_assert(!"Unexpected audio format!");
+}
 
 void SDL_ChooseAudioConverters(void)
 {
@@ -685,6 +875,26 @@ void SDL_ChooseAudioConverters(void)
         return;
     }
 
+#define SET_CONVERTER_FUNCS(fntype) \
+    SDL_Convert_Swap16 = SDL_Convert_Swap16_##fntype; \
+    SDL_Convert_Swap32 = SDL_Convert_Swap32_##fntype;
+    // Pick the byteswappers: SSSE3 (gated on the SSE4.1 check), else NEON, else scalar. Each dangling "else" chains into whichever branch is compiled next.
+#ifdef SDL_SSE4_1_INTRINSICS
+    if (SDL_HasSSE41()) {
+        SET_CONVERTER_FUNCS(SSSE3);
+    } else
+#endif
+#ifdef SDL_NEON_INTRINSICS
+    if (SDL_HasNEON()) {
+        SET_CONVERTER_FUNCS(NEON);
+    } else
+#endif
+    {
+        SET_CONVERTER_FUNCS(Scalar);
+    }
+    // Undefined here because the same macro name is defined again for the sample-format converters.
+#undef SET_CONVERTER_FUNCS
+
 #define SET_CONVERTER_FUNCS(fntype) \
     SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
     SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
@@ -694,25 +904,22 @@ void SDL_ChooseAudioConverters(void)
     SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
     SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
     SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
-    converters_chosen = SDL_TRUE
 
 #ifdef SDL_SSE2_INTRINSICS
     if (SDL_HasSSE2()) {
         SET_CONVERTER_FUNCS(SSE2);
-        return;
-    }
+    } else
 #endif
-
 #ifdef SDL_NEON_INTRINSICS
     if (SDL_HasNEON()) {
         SET_CONVERTER_FUNCS(NEON);
-        return;
-    }
+    } else
 #endif
-
-    SET_CONVERTER_FUNCS(Scalar);
+    {
+        SET_CONVERTER_FUNCS(Scalar);
+    }
 
 #undef SET_CONVERTER_FUNCS
 
-    SDL_assert(converters_chosen == SDL_TRUE);
+    converters_chosen = SDL_TRUE;
 }
diff --git a/src/audio/SDL_sysaudio.h b/src/audio/SDL_sysaudio.h
index 0687e8ce3..d2e243d8c 100644
--- a/src/audio/SDL_sysaudio.h
+++ b/src/audio/SDL_sysaudio.h
@@ -35,16 +35,6 @@
 #define LOG_DEBUG_AUDIO_CONVERT(from, to)
 #endif
 
-// These pointers get set during SDL_ChooseAudioConverters() to various SIMD implementations.
-extern void (*SDL_Convert_S8_to_F32)(float *dst, const Sint8 *src, int num_samples);
-extern void (*SDL_Convert_U8_to_F32)(float *dst, const Uint8 *src, int num_samples);
-extern void (*SDL_Convert_S16_to_F32)(float *dst, const Sint16 *src, int num_samples);
-extern void (*SDL_Convert_S32_to_F32)(float *dst, const Sint32 *src, int num_samples);
-extern void (*SDL_Convert_F32_to_S8)(Sint8 *dst, const float *src, int num_samples);
-extern void (*SDL_Convert_F32_to_U8)(Uint8 *dst, const float *src, int num_samples);
-extern void (*SDL_Convert_F32_to_S16)(Sint16 *dst, const float *src, int num_samples);
-extern void (*SDL_Convert_F32_to_S32)(Sint32 *dst, const float *src, int num_samples);
-
 // !!! FIXME: These are wordy and unlocalized...
 #define DEFAULT_OUTPUT_DEVNAME "System audio output device"
 #define DEFAULT_INPUT_DEVNAME  "System audio capture device"
@@ -119,6 +109,10 @@ extern SDL_bool SDL_CaptureAudioThreadIterate(SDL_AudioDevice *device);
 extern void SDL_CaptureAudioThreadShutdown(SDL_AudioDevice *device);
 extern void SDL_AudioThreadFinalize(SDL_AudioDevice *device);
 
+extern void ConvertAudioToFloat(float *dst, const void *src, int num_samples, SDL_AudioFormat src_fmt);
+extern void ConvertAudioFromFloat(void *dst, const float *src, int num_samples, SDL_AudioFormat dst_fmt);
+extern void ConvertAudioSwapEndian(void* dst, const void* src, int num_samples, int bitsize);
+
 // this gets used from the audio device threads. It has rules, don't use this if you don't know how to use it!
 extern void ConvertAudio(int num_frames, const void *src, SDL_AudioFormat src_format, int src_channels,
                          void *dst, SDL_AudioFormat dst_format, int dst_channels, void* scratch);