diff --git a/CMakeLists.txt b/CMakeLists.txt index ccc0ee23e..13af6c4d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -735,7 +735,6 @@ if(SDL_ASSEMBLY) cmake_pop_check_state() if(CPU_SUPPORTS_AVX) set(HAVE_AVX TRUE) - target_compile_options(sdl-build-options INTERFACE "-mavx") endif() endif() @@ -760,7 +759,6 @@ if(SDL_ASSEMBLY) cmake_pop_check_state() if(CPU_SUPPORTS_MMX) set(HAVE_MMX TRUE) - target_compile_options(sdl-build-options INTERFACE "-mmmx") endif() endif() @@ -785,7 +783,6 @@ if(SDL_ASSEMBLY) cmake_pop_check_state() if(CPU_SUPPORTS_SSE) set(HAVE_SSE ON) - target_compile_options(sdl-build-options INTERFACE "-msse") endif() endif() @@ -810,7 +807,6 @@ if(SDL_ASSEMBLY) cmake_pop_check_state() if(CPU_SUPPORTS_SSE2) set(HAVE_SSE2 TRUE) - target_compile_options(sdl-build-options INTERFACE "-msse2") endif() endif() @@ -835,7 +831,6 @@ if(SDL_ASSEMBLY) cmake_pop_check_state() if(CPU_SUPPORTS_SSE3) set(HAVE_SSE3 TRUE) - target_compile_options(sdl-build-options INTERFACE "-msse3") endif() endif() diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h index 6fedc18b2..adec1346d 100644 --- a/include/SDL3/SDL_intrin.h +++ b/include/SDL3/SDL_intrin.h @@ -93,25 +93,39 @@ _m_prefetch(void *__P) #endif #endif /* compiler version */ +#if defined(__clang__) && defined(__has_attribute) +# if __has_attribute(target) +# define SDL_HAS_TARGET_ATTRIBS +# endif +#elif defined(__GNUC__) && (__GNUC__ + (__GNUC_MINOR__ >= 9) > 4) /* gcc >= 4.9 */ +# define SDL_HAS_TARGET_ATTRIBS +#endif + +#ifdef SDL_HAS_TARGET_ATTRIBS +# define SDL_TARGETING(x) __attribute__((target(x))) +#else +# define SDL_TARGETING(x) +#endif + #if defined(__loongarch_sx) && !defined(SDL_DISABLE_LSX) #include #endif #if defined(__loongarch_asx) && !defined(SDL_DISABLE_LASX) #include #endif -#if defined(__AVX__) && !defined(SDL_DISABLE_AVX) +#if (defined(__AVX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_AVX) #include #endif -#if defined(__MMX__) && !defined(SDL_DISABLE_MMX) 
+#if (defined(__MMX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_MMX) #include #endif -#if defined(__SSE__) && !defined(SDL_DISABLE_SSE) +#if (defined(__SSE__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE) #include #endif -#if defined(__SSE2__) && !defined(SDL_DISABLE_SSE2) +#if (defined(__SSE2__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE2) #include #endif -#if defined(__SSE3__) && !defined(SDL_DISABLE_SSE3) +#if (defined(__SSE3__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE3) #include #endif diff --git a/src/SDL_internal.h b/src/SDL_internal.h index 89c6f224e..fd65cf864 100644 --- a/src/SDL_internal.h +++ b/src/SDL_internal.h @@ -194,23 +194,23 @@ #define HAVE_NEON_INTRINSICS 1 #endif -#if defined(__MMX__) && !defined(SDL_DISABLE_MMX) +#if (defined(__MMX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_MMX) #define HAVE_MMX_INTRINSICS 1 #endif -#if defined(__SSE__) && !defined(SDL_DISABLE_SSE) +#if (defined(__SSE__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE) #define HAVE_SSE_INTRINSICS 1 #endif -#if defined(__SSE2__) && !defined(SDL_DISABLE_SSE2) +#if (defined(__SSE2__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE2) #define HAVE_SSE2_INTRINSICS 1 #endif -#if defined(__SSE3__) && !defined(SDL_DISABLE_SSE3) +#if (defined(__SSE3__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE3) #define HAVE_SSE3_INTRINSICS 1 #endif -#if defined(__AVX__) && !defined(SDL_DISABLE_AVX) +#if (defined(__AVX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_AVX) #define HAVE_AVX_INTRINSICS 1 #endif @@ -222,19 +222,6 @@ #define HAVE_LASX_INTRINSICS 1 #endif -#if defined __clang__ -#if (!__has_attribute(target)) -#undef HAVE_AVX_INTRINSICS -#endif -#if (defined(_MSC_VER) || defined(__SCE__)) && !defined(__AVX__) -#undef HAVE_AVX_INTRINSICS -#endif -#elif defined __GNUC__ -#if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 
9) -#undef HAVE_AVX_INTRINSICS -#endif -#endif - #define SDL_MAIN_NOIMPL /* don't drag in header-only implementation of SDL_main */ #include diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c index c29c80358..2eb74f854 100644 --- a/src/audio/SDL_audiocvt.c +++ b/src/audio/SDL_audiocvt.c @@ -146,7 +146,7 @@ static int SDL_ConvertAudio(SDL_AudioCVT * cvt); #if HAVE_SSE3_INTRINSICS /* Convert from stereo to mono. Average left and right. */ -static void SDLCALL SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse3") SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFormat format) { const __m128 divby2 = _mm_set1_ps(0.5f); float *dst = (float *)cvt->buf; @@ -183,7 +183,7 @@ static void SDLCALL SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFor #if HAVE_SSE_INTRINSICS /* Convert from mono to stereo. Duplicate to stereo left and right. */ -static void SDLCALL SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse") SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT *cvt, SDL_AudioFormat format) { float *dst = ((float *)(cvt->buf + (cvt->len_cvt * 2))) - 8; const float *src = ((const float *)(cvt->buf + cvt->len_cvt)) - 4; diff --git a/src/audio/SDL_audiotypecvt.c b/src/audio/SDL_audiotypecvt.c index 86ab19507..e94a890dc 100644 --- a/src/audio/SDL_audiotypecvt.c +++ b/src/audio/SDL_audiotypecvt.c @@ -225,7 +225,7 @@ static void SDLCALL SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFo #endif #if HAVE_SSE2_INTRINSICS -static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) { const Sint8 *src = ((const Sint8 *)(cvt->buf + cvt->len_cvt)) - 1; float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1; @@ -289,7 +289,7 @@ static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, 
SDL_AudioForma } } -static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) { const Uint8 *src = ((const Uint8 *)(cvt->buf + cvt->len_cvt)) - 1; float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1; @@ -355,7 +355,7 @@ static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma } } -static void SDLCALL SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) { const Sint16 *src = ((const Sint16 *)(cvt->buf + cvt->len_cvt)) - 1; float *dst = ((float *)(cvt->buf + cvt->len_cvt * 2)) - 1; @@ -408,7 +408,7 @@ static void SDLCALL SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm } } -static void SDLCALL SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) { const Sint32 *src = (const Sint32 *)cvt->buf; float *dst = (float *)cvt->buf; @@ -451,7 +451,7 @@ static void SDLCALL SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm } } -static void SDLCALL SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) { const float *src = (const float *)cvt->buf; Sint8 *dst = (Sint8 *)cvt->buf; @@ -514,7 +514,7 @@ static void SDLCALL SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma } } -static void SDLCALL SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) { const float *src = (const float *)cvt->buf; Uint8 *dst = cvt->buf; @@ -577,7 +577,7 @@ static void SDLCALL 
SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma } } -static void SDLCALL SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) { const float *src = (const float *)cvt->buf; Sint16 *dst = (Sint16 *)cvt->buf; @@ -638,7 +638,7 @@ static void SDLCALL SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm } } -static void SDLCALL SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) +static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) { const float *src = (const float *)cvt->buf; Sint32 *dst = (Sint32 *)cvt->buf; diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index b0e20743a..a9324a3eb 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -169,7 +169,7 @@ static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info) #if HAVE_MMX_INTRINSICS /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ -static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) +static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) { int width = info->dst_w; int height = info->dst_h; @@ -223,7 +223,7 @@ static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) } /* fast RGB888->(A)RGB888 blending with surface alpha */ -static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) +static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) { SDL_PixelFormat *df = info->dst_fmt; Uint32 chanmask; @@ -318,7 +318,7 @@ static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) } /* fast ARGB888->(A)RGB888 blending with pixel alpha */ -static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) +static void SDL_TARGETING("mmx") BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) { int width = info->dst_w; int height = info->dst_h; @@ -753,7 +753,7 @@ static void 
Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask) #if HAVE_MMX_INTRINSICS /* fast RGB565->RGB565 blending with surface alpha */ -static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) +static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) { unsigned alpha = info->a; if (alpha == 128) { @@ -889,7 +889,7 @@ static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) } /* fast RGB555->RGB555 blending with surface alpha */ -static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) +static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) { unsigned alpha = info->a; if (alpha == 128) { diff --git a/src/video/SDL_blit_copy.c b/src/video/SDL_blit_copy.c index 45536b4b2..cc8cd2204 100644 --- a/src/video/SDL_blit_copy.c +++ b/src/video/SDL_blit_copy.c @@ -25,7 +25,7 @@ #if HAVE_SSE_INTRINSICS /* This assumes 16-byte aligned src and dst */ -static SDL_INLINE void SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len) +static SDL_INLINE void SDL_TARGETING("sse") SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len) { int i; @@ -54,7 +54,7 @@ static SDL_INLINE void SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len) #ifdef _MSC_VER #pragma warning(disable : 4799) #endif -static SDL_INLINE void SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len) +static SDL_INLINE void SDL_TARGETING("mmx") SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len) { const int remain = (len & 63); int i; @@ -81,6 +81,16 @@ static SDL_INLINE void SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len) SDL_memcpy(dst + skip, src + skip, remain); } } + +static SDL_INLINE void SDL_TARGETING("mmx") SDL_BlitCopyMMX(Uint8 *dst, const Uint8 *src, const int dstskip, const int srcskip, const int w, int h) +{ + while (h--) { + SDL_memcpyMMX(dst, src, w); + src += srcskip; + dst += dstskip; + } + _mm_empty(); +} #endif /* HAVE_MMX_INTRINSICS */ void SDL_BlitCopy(SDL_BlitInfo *info) @@ -137,12 +147,7 @@ void SDL_BlitCopy(SDL_BlitInfo *info) #if 
HAVE_MMX_INTRINSICS if (SDL_HasMMX() && !(srcskip & 7) && !(dstskip & 7)) { - while (h--) { - SDL_memcpyMMX(dst, src, w); - src += srcskip; - dst += dstskip; - } - _mm_empty(); + SDL_BlitCopyMMX(dst, src, dstskip, srcskip, w, h); return; } #endif diff --git a/src/video/SDL_fillrect.c b/src/video/SDL_fillrect.c index 595cf7c32..59c8128e9 100644 --- a/src/video/SDL_fillrect.c +++ b/src/video/SDL_fillrect.c @@ -55,7 +55,7 @@ #define SSE_END #define DEFINE_SSE_FILLRECT(bpp, type) \ -static void SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \ +static void SDL_TARGETING("sse") SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \ { \ int i, n; \ Uint8 *p = NULL; \ @@ -92,7 +92,7 @@ static void SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color SSE_END; \ } -static void SDL_FillSurfaceRect1SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) +static void SDL_TARGETING("sse") SDL_FillSurfaceRect1SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) { int i, n; diff --git a/src/video/SDL_stretch.c b/src/video/SDL_stretch.c index 36eb391a0..4b4091470 100644 --- a/src/video/SDL_stretch.c +++ b/src/video/SDL_stretch.c @@ -349,7 +349,7 @@ static int scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch, #if defined(HAVE_SSE2_INTRINSICS) #if 0 -static void printf_128(const char *str, __m128i var) +static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var) { uint16_t *val = (uint16_t*) &var; printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n", @@ -367,7 +367,7 @@ static SDL_INLINE int hasSSE2(void) return val; } -static SDL_INLINE void INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero) +static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero) { 
__m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */ __m128i v_frac_w0, k0, l0, d0, e0; @@ -404,7 +404,7 @@ static SDL_INLINE void INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, *dst = _mm_cvtsi128_si32(e0); } -static int scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch) +static int SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch) { BILINEAR___START diff --git a/src/video/SDL_yuv.c b/src/video/SDL_yuv.c index 06a7fa708..7c4d6f67b 100644 --- a/src/video/SDL_yuv.c +++ b/src/video/SDL_yuv.c @@ -303,14 +303,14 @@ static int GetYUVPlanes(int width, int height, Uint32 format, const void *yuv, i return 0; } -static SDL_bool yuv_rgb_sse( +#if HAVE_SSE2_INTRINSICS +static SDL_bool SDL_TARGETING("sse2") yuv_rgb_sse( Uint32 src_format, Uint32 dst_format, Uint32 width, Uint32 height, const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride, Uint8 *rgb, Uint32 rgb_stride, YCbCrType yuv_type) { -#if HAVE_SSE2_INTRINSICS if (!SDL_HasSSE2()) { return SDL_FALSE; } @@ -408,10 +408,21 @@ static SDL_bool yuv_rgb_sse( break; } } -#endif return SDL_FALSE; } +#else +static SDL_bool yuv_rgb_sse( + Uint32 src_format, Uint32 dst_format, + Uint32 width, Uint32 height, + const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride, + Uint8 *rgb, Uint32 rgb_stride, + YCbCrType yuv_type) +{ + return SDL_FALSE; +} +#endif +#if HAVE_LSX_INTRINSICS static SDL_bool yuv_rgb_lsx( Uint32 src_format, Uint32 dst_format, Uint32 width, Uint32 height, @@ -419,7 +430,6 @@ static SDL_bool yuv_rgb_lsx( Uint8 *rgb, Uint32 rgb_stride, YCbCrType yuv_type) { -#if HAVE_LSX_INTRINSICS if (!SDL_HasLSX()) { return SDL_FALSE; } @@ -450,9 +460,19 @@ static SDL_bool yuv_rgb_lsx( break; } } -#endif return SDL_FALSE; } +#else +static SDL_bool yuv_rgb_lsx( + Uint32 src_format, Uint32 dst_format, + 
Uint32 width, Uint32 height, + const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride, + Uint8 *rgb, Uint32 rgb_stride, + YCbCrType yuv_type) +{ + return SDL_FALSE; +} +#endif static SDL_bool yuv_rgb_std( Uint32 src_format, Uint32 dst_format, @@ -1102,7 +1122,8 @@ static int SDL_ConvertPixels_SwapUVPlanes(int width, int height, const void *src return 0; } -static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV) +#if HAVE_SSE2_INTRINSICS +static int SDL_TARGETING("sse2") SDL_ConvertPixels_PackUVPlanes_to_NV_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV) { int x, y; const int UVwidth = (width + 1) / 2; @@ -1114,9 +1135,6 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi const Uint8 *src1, *src2; Uint8 *dstUV; Uint8 *tmp = NULL; -#if HAVE_SSE2_INTRINSICS - const SDL_bool use_SSE2 = SDL_HasSSE2(); -#endif /* Skip the Y plane */ src = (const Uint8 *)src + height * src_pitch; @@ -1144,22 +1162,18 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi y = UVheight; while (y--) { x = UVwidth; -#if HAVE_SSE2_INTRINSICS - if (use_SSE2) { - while (x >= 16) { - __m128i u = _mm_loadu_si128((__m128i *)src1); - __m128i v = _mm_loadu_si128((__m128i *)src2); - __m128i uv1 = _mm_unpacklo_epi8(u, v); - __m128i uv2 = _mm_unpackhi_epi8(u, v); - _mm_storeu_si128((__m128i *)dstUV, uv1); - _mm_storeu_si128((__m128i *)(dstUV + 16), uv2); - src1 += 16; - src2 += 16; - dstUV += 32; - x -= 16; - } + while (x >= 16) { + __m128i u = _mm_loadu_si128((__m128i *)src1); + __m128i v = _mm_loadu_si128((__m128i *)src2); + __m128i uv1 = _mm_unpacklo_epi8(u, v); + __m128i uv2 = _mm_unpackhi_epi8(u, v); + _mm_storeu_si128((__m128i *)dstUV, uv1); + _mm_storeu_si128((__m128i *)(dstUV + 16), uv2); + src1 += 16; + src2 += 16; + dstUV += 32; + x -= 16; } -#endif while 
(x--) { *dstUV++ = *src1++; *dstUV++ = *src2++; @@ -1174,8 +1188,68 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi } return 0; } +#endif -static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV) +static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV) +{ + if (SDL_HasSSE2()) { + return SDL_ConvertPixels_PackUVPlanes_to_NV_SSE2(width, height, src, src_pitch, dst, dst_pitch, reverseUV); + } else { + int x, y; + const int UVwidth = (width + 1) / 2; + const int UVheight = (height + 1) / 2; + const int srcUVPitch = ((src_pitch + 1) / 2); + const int srcUVPitchLeft = srcUVPitch - UVwidth; + const int dstUVPitch = ((dst_pitch + 1) / 2) * 2; + const int dstUVPitchLeft = dstUVPitch - UVwidth * 2; + const Uint8 *src1, *src2; + Uint8 *dstUV; + Uint8 *tmp = NULL; + + /* Skip the Y plane */ + src = (const Uint8 *)src + height * src_pitch; + dst = (Uint8 *)dst + height * dst_pitch; + + if (src == dst) { + /* Need to make a copy of the buffer so we don't clobber it while converting */ + tmp = (Uint8 *)SDL_malloc((size_t)2 * UVheight * srcUVPitch); + if (tmp == NULL) { + return SDL_OutOfMemory(); + } + SDL_memcpy(tmp, src, (size_t)2 * UVheight * srcUVPitch); + src = tmp; + } + + if (reverseUV) { + src2 = (const Uint8 *)src; + src1 = src2 + UVheight * srcUVPitch; + } else { + src1 = (const Uint8 *)src; + src2 = src1 + UVheight * srcUVPitch; + } + dstUV = (Uint8 *)dst; + + y = UVheight; + while (y--) { + x = UVwidth; + while (x--) { + *dstUV++ = *src1++; + *dstUV++ = *src2++; + } + src1 += srcUVPitchLeft; + src2 += srcUVPitchLeft; + dstUV += dstUVPitchLeft; + } + + if (tmp) { + SDL_free(tmp); + } + return 0; + } +} + +#if HAVE_SSE2_INTRINSICS +static int SDL_TARGETING("sse2") SDL_ConvertPixels_SplitNV_to_UVPlanes_SSE2(int width, int height, const void *src, int 
src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV) { int x, y; const int UVwidth = (width + 1) / 2; @@ -1187,10 +1261,82 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo const Uint8 *srcUV; Uint8 *dst1, *dst2; Uint8 *tmp = NULL; -#if HAVE_SSE2_INTRINSICS - const SDL_bool use_SSE2 = SDL_HasSSE2(); + + /* Skip the Y plane */ + src = (const Uint8 *)src + height * src_pitch; + dst = (Uint8 *)dst + height * dst_pitch; + + if (src == dst) { + /* Need to make a copy of the buffer so we don't clobber it while converting */ + tmp = (Uint8 *)SDL_malloc((size_t)UVheight * srcUVPitch); + if (tmp == NULL) { + return SDL_OutOfMemory(); + } + SDL_memcpy(tmp, src, (size_t)UVheight * srcUVPitch); + src = tmp; + } + + if (reverseUV) { + dst2 = (Uint8 *)dst; + dst1 = dst2 + UVheight * dstUVPitch; + } else { + dst1 = (Uint8 *)dst; + dst2 = dst1 + UVheight * dstUVPitch; + } + srcUV = (const Uint8 *)src; + + y = UVheight; + while (y--) { + __m128i mask = _mm_set1_epi16(0x00FF); + x = UVwidth; + while (x >= 16) { + __m128i uv1 = _mm_loadu_si128((__m128i *)srcUV); + __m128i uv2 = _mm_loadu_si128((__m128i *)(srcUV + 16)); + __m128i u1 = _mm_and_si128(uv1, mask); + __m128i u2 = _mm_and_si128(uv2, mask); + __m128i u = _mm_packus_epi16(u1, u2); + __m128i v1 = _mm_srli_epi16(uv1, 8); + __m128i v2 = _mm_srli_epi16(uv2, 8); + __m128i v = _mm_packus_epi16(v1, v2); + _mm_storeu_si128((__m128i *)dst1, u); + _mm_storeu_si128((__m128i *)dst2, v); + srcUV += 32; + dst1 += 16; + dst2 += 16; + x -= 16; + } + while (x--) { + *dst1++ = *srcUV++; + *dst2++ = *srcUV++; + } + srcUV += srcUVPitchLeft; + dst1 += dstUVPitchLeft; + dst2 += dstUVPitchLeft; + } + + if (tmp) { + SDL_free(tmp); + } + return 0; +} #endif +static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV) +{ + if (SDL_HasSSE2()) { + return SDL_ConvertPixels_SplitNV_to_UVPlanes_SSE2(width, height, src, src_pitch, 
dst, dst_pitch, reverseUV); + } else { + int x, y; + const int UVwidth = (width + 1) / 2; + const int UVheight = (height + 1) / 2; + const int srcUVPitch = ((src_pitch + 1) / 2) * 2; + const int srcUVPitchLeft = srcUVPitch - UVwidth * 2; + const int dstUVPitch = ((dst_pitch + 1) / 2); + const int dstUVPitchLeft = dstUVPitch - UVwidth; + const Uint8 *srcUV; + Uint8 *dst1, *dst2; + Uint8 *tmp = NULL; + /* Skip the Y plane */ src = (const Uint8 *)src + height * src_pitch; dst = (Uint8 *)dst + height * dst_pitch; @@ -1217,27 +1363,6 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo y = UVheight; while (y--) { x = UVwidth; -#if HAVE_SSE2_INTRINSICS - if (use_SSE2) { - __m128i mask = _mm_set1_epi16(0x00FF); - while (x >= 16) { - __m128i uv1 = _mm_loadu_si128((__m128i *)srcUV); - __m128i uv2 = _mm_loadu_si128((__m128i *)(srcUV + 16)); - __m128i u1 = _mm_and_si128(uv1, mask); - __m128i u2 = _mm_and_si128(uv2, mask); - __m128i u = _mm_packus_epi16(u1, u2); - __m128i v1 = _mm_srli_epi16(uv1, 8); - __m128i v2 = _mm_srli_epi16(uv2, 8); - __m128i v = _mm_packus_epi16(v1, v2); - _mm_storeu_si128((__m128i *)dst1, u); - _mm_storeu_si128((__m128i *)dst2, v); - srcUV += 32; - dst1 += 16; - dst2 += 16; - x -= 16; - } - } -#endif while (x--) { *dst1++ = *srcUV++; *dst2++ = *srcUV++; @@ -1251,9 +1376,11 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo SDL_free(tmp); } return 0; + } } -static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) +#if HAVE_SSE2_INTRINSICS +static int SDL_TARGETING("sse2") SDL_ConvertPixels_SwapNV_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) { int x, y; const int UVwidth = (width + 1) / 2; @@ -1264,9 +1391,6 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int const int dstUVPitchLeft = (dstUVPitch - UVwidth * 2) / sizeof(Uint16); const Uint16 *srcUV; Uint16 
*dstUV; -#if HAVE_SSE2_INTRINSICS - const SDL_bool use_SSE2 = SDL_HasSSE2(); -#endif /* Skip the Y plane */ src = (const Uint8 *)src + height * src_pitch; @@ -1277,20 +1401,16 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int y = UVheight; while (y--) { x = UVwidth; -#if HAVE_SSE2_INTRINSICS - if (use_SSE2) { - while (x >= 8) { - __m128i uv = _mm_loadu_si128((__m128i *)srcUV); - __m128i v = _mm_slli_epi16(uv, 8); - __m128i u = _mm_srli_epi16(uv, 8); - __m128i vu = _mm_or_si128(v, u); - _mm_storeu_si128((__m128i *)dstUV, vu); - srcUV += 8; - dstUV += 8; - x -= 8; - } + while (x >= 8) { + __m128i uv = _mm_loadu_si128((__m128i *)srcUV); + __m128i v = _mm_slli_epi16(uv, 8); + __m128i u = _mm_srli_epi16(uv, 8); + __m128i vu = _mm_or_si128(v, u); + _mm_storeu_si128((__m128i *)dstUV, vu); + srcUV += 8; + dstUV += 8; + x -= 8; } -#endif while (x--) { *dstUV++ = SDL_Swap16(*srcUV++); } @@ -1299,6 +1419,41 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int } return 0; } +#endif + +static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) +{ + if (SDL_HasSSE2()) { + return SDL_ConvertPixels_SwapNV_SSE2(width, height, src, src_pitch, dst, dst_pitch); + } else { + int x, y; + const int UVwidth = (width + 1) / 2; + const int UVheight = (height + 1) / 2; + const int srcUVPitch = ((src_pitch + 1) / 2) * 2; + const int srcUVPitchLeft = (srcUVPitch - UVwidth * 2) / sizeof(Uint16); + const int dstUVPitch = ((dst_pitch + 1) / 2) * 2; + const int dstUVPitchLeft = (dstUVPitch - UVwidth * 2) / sizeof(Uint16); + const Uint16 *srcUV; + Uint16 *dstUV; + + /* Skip the Y plane */ + src = (const Uint8 *)src + height * src_pitch; + dst = (Uint8 *)dst + height * dst_pitch; + + srcUV = (const Uint16 *)src; + dstUV = (Uint16 *)dst; + y = UVheight; + while (y--) { + x = UVwidth; + while (x--) { + *dstUV++ = SDL_Swap16(*srcUV++); + } + srcUV += srcUVPitchLeft; + dstUV += 
dstUVPitchLeft; + } + return 0; + } +} static int SDL_ConvertPixels_Planar2x2_to_Planar2x2(int width, int height, Uint32 src_format, const void *src, int src_pitch, @@ -1389,28 +1544,232 @@ static int SDL_ConvertPixels_Planar2x2_to_Planar2x2(int width, int height, x -= 4; \ } +static int SDL_TARGETING("sse2") SDL_ConvertPixels_YUY2_to_UYVY_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) +{ + int x, y; + const int YUVwidth = (width + 1) / 2; + const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); + const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); + const Uint8 *srcYUV = (const Uint8 *)src; + Uint8 *dstYUV = (Uint8 *)dst; + + y = height; + x = YUVwidth; + while (y--) { + PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1)); + while (x--) { + Uint8 Y1, U, Y2, V; + + Y1 = srcYUV[0]; + U = srcYUV[1]; + Y2 = srcYUV[2]; + V = srcYUV[3]; + srcYUV += 4; + + dstYUV[0] = U; + dstYUV[1] = Y1; + dstYUV[2] = V; + dstYUV[3] = Y2; + dstYUV += 4; + } + srcYUV += srcYUVPitchLeft; + dstYUV += dstYUVPitchLeft; + x = YUVwidth; + } + return 0; +} + +static int SDL_TARGETING("sse2") SDL_ConvertPixels_YUY2_to_YVYU_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) +{ + int x, y; + const int YUVwidth = (width + 1) / 2; + const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); + const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); + const Uint8 *srcYUV = (const Uint8 *)src; + Uint8 *dstYUV = (Uint8 *)dst; + + y = height; + x = YUVwidth; + while (y--) { + PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(1, 2, 3, 0)); + while (x--) { + Uint8 Y1, U, Y2, V; + + Y1 = srcYUV[0]; + U = srcYUV[1]; + Y2 = srcYUV[2]; + V = srcYUV[3]; + srcYUV += 4; + + dstYUV[0] = Y1; + dstYUV[1] = V; + dstYUV[2] = Y2; + dstYUV[3] = U; + dstYUV += 4; + } + srcYUV += srcYUVPitchLeft; + dstYUV += dstYUVPitchLeft; + x = YUVwidth; + } + return 0; +} + +static int SDL_TARGETING("sse2") SDL_ConvertPixels_UYVY_to_YUY2_SSE2(int width, int height, 
const void *src, int src_pitch, void *dst, int dst_pitch) +{ + int x, y; + const int YUVwidth = (width + 1) / 2; + const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); + const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); + const Uint8 *srcYUV = (const Uint8 *)src; + Uint8 *dstYUV = (Uint8 *)dst; + + y = height; + x = YUVwidth; + while (y--) { + PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1)); + while (x--) { + Uint8 Y1, U, Y2, V; + + U = srcYUV[0]; + Y1 = srcYUV[1]; + V = srcYUV[2]; + Y2 = srcYUV[3]; + srcYUV += 4; + + dstYUV[0] = Y1; + dstYUV[1] = U; + dstYUV[2] = Y2; + dstYUV[3] = V; + dstYUV += 4; + } + srcYUV += srcYUVPitchLeft; + dstYUV += dstYUVPitchLeft; + x = YUVwidth; + } + return 0; +} + +static int SDL_TARGETING("sse2") SDL_ConvertPixels_UYVY_to_YVYU_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) +{ + int x, y; + const int YUVwidth = (width + 1) / 2; + const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); + const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); + const Uint8 *srcYUV = (const Uint8 *)src; + Uint8 *dstYUV = (Uint8 *)dst; + + y = height; + x = YUVwidth; + while (y--) { + PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(0, 3, 2, 1)); + while (x--) { + Uint8 Y1, U, Y2, V; + + U = srcYUV[0]; + Y1 = srcYUV[1]; + V = srcYUV[2]; + Y2 = srcYUV[3]; + srcYUV += 4; + + dstYUV[0] = Y1; + dstYUV[1] = V; + dstYUV[2] = Y2; + dstYUV[3] = U; + dstYUV += 4; + } + srcYUV += srcYUVPitchLeft; + dstYUV += dstYUVPitchLeft; + x = YUVwidth; + } + return 0; +} + +static int SDL_TARGETING("sse2") SDL_ConvertPixels_YVYU_to_YUY2_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) +{ + int x, y; + const int YUVwidth = (width + 1) / 2; + const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); + const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); + const Uint8 *srcYUV = (const Uint8 *)src; + Uint8 *dstYUV = (Uint8 *)dst; + + y = height; + x = YUVwidth; + while (y--) { + 
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(1, 2, 3, 0)); + while (x--) { + Uint8 Y1, U, Y2, V; + + Y1 = srcYUV[0]; + V = srcYUV[1]; + Y2 = srcYUV[2]; + U = srcYUV[3]; + srcYUV += 4; + + dstYUV[0] = Y1; + dstYUV[1] = U; + dstYUV[2] = Y2; + dstYUV[3] = V; + dstYUV += 4; + } + srcYUV += srcYUVPitchLeft; + dstYUV += dstYUVPitchLeft; + x = YUVwidth; + } + return 0; +} + +static int SDL_TARGETING("sse2") SDL_ConvertPixels_YVYU_to_UYVY_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) +{ + int x, y; + const int YUVwidth = (width + 1) / 2; + const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); + const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); + const Uint8 *srcYUV = (const Uint8 *)src; + Uint8 *dstYUV = (Uint8 *)dst; + + y = height; + x = YUVwidth; + while (y--) { + PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 1, 0, 3)); + while (x--) { + Uint8 Y1, U, Y2, V; + + Y1 = srcYUV[0]; + V = srcYUV[1]; + Y2 = srcYUV[2]; + U = srcYUV[3]; + srcYUV += 4; + + dstYUV[0] = U; + dstYUV[1] = Y1; + dstYUV[2] = V; + dstYUV[3] = Y2; + dstYUV += 4; + } + srcYUV += srcYUVPitchLeft; + dstYUV += dstYUVPitchLeft; + x = YUVwidth; + } + return 0; +} #endif static int SDL_ConvertPixels_YUY2_to_UYVY(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) { + if (SDL_HasSSE2()) { + return SDL_ConvertPixels_YUY2_to_UYVY_SSE2(width, height, src, src_pitch, dst, dst_pitch); + } else { int x, y; const int YUVwidth = (width + 1) / 2; const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const Uint8 *srcYUV = (const Uint8 *)src; Uint8 *dstYUV = (Uint8 *)dst; -#if HAVE_SSE2_INTRINSICS - const SDL_bool use_SSE2 = SDL_HasSSE2(); -#endif y = height; while (y--) { x = YUVwidth; -#if HAVE_SSE2_INTRINSICS - if (use_SSE2) { - PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1)); - } -#endif while (x--) { Uint8 Y1, U, Y2, V; @@ -1430,28 +1789,24 @@ static int SDL_ConvertPixels_YUY2_to_UYVY(int 
width, int height, const void *src dstYUV += dstYUVPitchLeft; } return 0; + } } static int SDL_ConvertPixels_YUY2_to_YVYU(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) { + if (SDL_HasSSE2()) { + return SDL_ConvertPixels_YUY2_to_YVYU_SSE2(width, height, src, src_pitch, dst, dst_pitch); + } else { int x, y; const int YUVwidth = (width + 1) / 2; const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const Uint8 *srcYUV = (const Uint8 *)src; Uint8 *dstYUV = (Uint8 *)dst; -#if HAVE_SSE2_INTRINSICS - const SDL_bool use_SSE2 = SDL_HasSSE2(); -#endif y = height; while (y--) { x = YUVwidth; -#if HAVE_SSE2_INTRINSICS - if (use_SSE2) { - PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(1, 2, 3, 0)); - } -#endif while (x--) { Uint8 Y1, U, Y2, V; @@ -1471,28 +1826,24 @@ static int SDL_ConvertPixels_YUY2_to_YVYU(int width, int height, const void *src dstYUV += dstYUVPitchLeft; } return 0; + } } static int SDL_ConvertPixels_UYVY_to_YUY2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) { + if (SDL_HasSSE2()) { + return SDL_ConvertPixels_UYVY_to_YUY2_SSE2(width, height, src, src_pitch, dst, dst_pitch); + } else { int x, y; const int YUVwidth = (width + 1) / 2; const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const Uint8 *srcYUV = (const Uint8 *)src; Uint8 *dstYUV = (Uint8 *)dst; -#if HAVE_SSE2_INTRINSICS - const SDL_bool use_SSE2 = SDL_HasSSE2(); -#endif y = height; while (y--) { x = YUVwidth; -#if HAVE_SSE2_INTRINSICS - if (use_SSE2) { - PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1)); - } -#endif while (x--) { Uint8 Y1, U, Y2, V; @@ -1512,28 +1863,24 @@ static int SDL_ConvertPixels_UYVY_to_YUY2(int width, int height, const void *src dstYUV += dstYUVPitchLeft; } return 0; + } } static int SDL_ConvertPixels_UYVY_to_YVYU(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) { 
+ if (SDL_HasSSE2()) { + return SDL_ConvertPixels_UYVY_to_YVYU_SSE2(width, height, src, src_pitch, dst, dst_pitch); + } else { int x, y; const int YUVwidth = (width + 1) / 2; const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const Uint8 *srcYUV = (const Uint8 *)src; Uint8 *dstYUV = (Uint8 *)dst; -#if HAVE_SSE2_INTRINSICS - const SDL_bool use_SSE2 = SDL_HasSSE2(); -#endif y = height; while (y--) { x = YUVwidth; -#if HAVE_SSE2_INTRINSICS - if (use_SSE2) { - PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(0, 3, 2, 1)); - } -#endif while (x--) { Uint8 Y1, U, Y2, V; @@ -1553,28 +1900,24 @@ static int SDL_ConvertPixels_UYVY_to_YVYU(int width, int height, const void *src dstYUV += dstYUVPitchLeft; } return 0; + } } static int SDL_ConvertPixels_YVYU_to_YUY2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) { + if (SDL_HasSSE2()) { + return SDL_ConvertPixels_YVYU_to_YUY2_SSE2(width, height, src, src_pitch, dst, dst_pitch); + } else { int x, y; const int YUVwidth = (width + 1) / 2; const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const Uint8 *srcYUV = (const Uint8 *)src; Uint8 *dstYUV = (Uint8 *)dst; -#if HAVE_SSE2_INTRINSICS - const SDL_bool use_SSE2 = SDL_HasSSE2(); -#endif y = height; while (y--) { x = YUVwidth; -#if HAVE_SSE2_INTRINSICS - if (use_SSE2) { - PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(1, 2, 3, 0)); - } -#endif while (x--) { Uint8 Y1, U, Y2, V; @@ -1594,28 +1937,24 @@ static int SDL_ConvertPixels_YVYU_to_YUY2(int width, int height, const void *src dstYUV += dstYUVPitchLeft; } return 0; + } } static int SDL_ConvertPixels_YVYU_to_UYVY(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) { + if (SDL_HasSSE2()) { + return SDL_ConvertPixels_YVYU_to_UYVY_SSE2(width, height, src, src_pitch, dst, dst_pitch); + } else { int x, y; const int YUVwidth = (width + 1) / 2; const int srcYUVPitchLeft = 
(src_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const Uint8 *srcYUV = (const Uint8 *)src; Uint8 *dstYUV = (Uint8 *)dst; -#if HAVE_SSE2_INTRINSICS - const SDL_bool use_SSE2 = SDL_HasSSE2(); -#endif y = height; while (y--) { x = YUVwidth; -#if HAVE_SSE2_INTRINSICS - if (use_SSE2) { - PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 1, 0, 3)); - } -#endif while (x--) { Uint8 Y1, U, Y2, V; @@ -1635,6 +1974,7 @@ static int SDL_ConvertPixels_YVYU_to_UYVY(int width, int height, const void *src dstYUV += dstYUVPitchLeft; } return 0; + } } static int SDL_ConvertPixels_Packed4_to_Packed4(int width, int height, diff --git a/src/video/yuv2rgb/yuv_rgb.c b/src/video/yuv2rgb/yuv_rgb.c index 5b96a4b7e..696e27282 100644 --- a/src/video/yuv2rgb/yuv_rgb.c +++ b/src/video/yuv2rgb/yuv_rgb.c @@ -609,7 +609,7 @@ V = _mm_srai_epi16(V, PRECISION); SAVE_SI128((__m128i*)(u_ptr), u1); \ SAVE_SI128((__m128i*)(v_ptr), v1); -void rgb24_yuv420_sse(uint32_t width, uint32_t height, +void SDL_TARGETING("sse2") rgb24_yuv420_sse(uint32_t width, uint32_t height, const uint8_t *RGB, uint32_t RGB_stride, uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, YCbCrType yuv_type) @@ -645,7 +645,7 @@ void rgb24_yuv420_sse(uint32_t width, uint32_t height, #undef SAVE_SI128 } -void rgb24_yuv420_sseu(uint32_t width, uint32_t height, +void SDL_TARGETING("sse2") rgb24_yuv420_sseu(uint32_t width, uint32_t height, const uint8_t *RGB, uint32_t RGB_stride, uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, YCbCrType yuv_type) diff --git a/src/video/yuv2rgb/yuv_rgb_sse_func.h b/src/video/yuv2rgb/yuv_rgb_sse_func.h index f541017a4..a3c15f258 100644 --- a/src/video/yuv2rgb/yuv_rgb_sse_func.h +++ b/src/video/yuv2rgb/yuv_rgb_sse_func.h @@ -382,7 +382,7 @@ PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ -void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, +void SDL_TARGETING("sse2") SSE_FUNCTION_NAME(uint32_t 
width, uint32_t height, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type)