use target attributes of gcc / clang for SIMD code.

main
Ozkan Sezer 2023-03-17 20:57:40 +03:00 committed by Ozkan Sezer
parent cd7a3f8af5
commit 69de6964e5
12 changed files with 513 additions and 172 deletions

View File

@ -735,7 +735,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state() cmake_pop_check_state()
if(CPU_SUPPORTS_AVX) if(CPU_SUPPORTS_AVX)
set(HAVE_AVX TRUE) set(HAVE_AVX TRUE)
target_compile_options(sdl-build-options INTERFACE "-mavx")
endif() endif()
endif() endif()
@ -760,7 +759,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state() cmake_pop_check_state()
if(CPU_SUPPORTS_MMX) if(CPU_SUPPORTS_MMX)
set(HAVE_MMX TRUE) set(HAVE_MMX TRUE)
target_compile_options(sdl-build-options INTERFACE "-mmmx")
endif() endif()
endif() endif()
@ -785,7 +783,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state() cmake_pop_check_state()
if(CPU_SUPPORTS_SSE) if(CPU_SUPPORTS_SSE)
set(HAVE_SSE ON) set(HAVE_SSE ON)
target_compile_options(sdl-build-options INTERFACE "-msse")
endif() endif()
endif() endif()
@ -810,7 +807,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state() cmake_pop_check_state()
if(CPU_SUPPORTS_SSE2) if(CPU_SUPPORTS_SSE2)
set(HAVE_SSE2 TRUE) set(HAVE_SSE2 TRUE)
target_compile_options(sdl-build-options INTERFACE "-msse2")
endif() endif()
endif() endif()
@ -835,7 +831,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state() cmake_pop_check_state()
if(CPU_SUPPORTS_SSE3) if(CPU_SUPPORTS_SSE3)
set(HAVE_SSE3 TRUE) set(HAVE_SSE3 TRUE)
target_compile_options(sdl-build-options INTERFACE "-msse3")
endif() endif()
endif() endif()

View File

@ -93,25 +93,39 @@ _m_prefetch(void *__P)
#endif #endif
#endif /* compiler version */ #endif /* compiler version */
/* Detect compilers that accept __attribute__((target("..."))) on functions:
 * clang when __has_attribute reports the attribute, otherwise gcc 4.9 or newer.
 * NOTE(review): this test is not architecture specific - confirm that users of
 * SDL_HAS_TARGET_ATTRIBS only pull in x86 headers on x86 builds. */
#if defined(__clang__) && defined(__has_attribute)
#  if __has_attribute(target)
#    define SDL_HAS_TARGET_ATTRIBS
#  endif
#elif defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 9)))
#  define SDL_HAS_TARGET_ATTRIBS
#endif

/* SDL_TARGETING("isa") marks one function as compiled for the named
 * instruction set; it expands to nothing when the attribute is unavailable. */
#if defined(SDL_HAS_TARGET_ATTRIBS)
#  define SDL_TARGETING(x) __attribute__((target(x)))
#else
#  define SDL_TARGETING(x)
#endif
#if defined(__loongarch_sx) && !defined(SDL_DISABLE_LSX) #if defined(__loongarch_sx) && !defined(SDL_DISABLE_LSX)
#include <lsxintrin.h> #include <lsxintrin.h>
#endif #endif
#if defined(__loongarch_asx) && !defined(SDL_DISABLE_LASX) #if defined(__loongarch_asx) && !defined(SDL_DISABLE_LASX)
#include <lasxintrin.h> #include <lasxintrin.h>
#endif #endif
#if defined(__AVX__) && !defined(SDL_DISABLE_AVX) #if (defined(__AVX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_AVX)
#include <immintrin.h> #include <immintrin.h>
#endif #endif
#if defined(__MMX__) && !defined(SDL_DISABLE_MMX) #if (defined(__MMX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_MMX)
#include <mmintrin.h> #include <mmintrin.h>
#endif #endif
#if defined(__SSE__) && !defined(SDL_DISABLE_SSE) #if (defined(__SSE__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE)
#include <xmmintrin.h> #include <xmmintrin.h>
#endif #endif
#if defined(__SSE2__) && !defined(SDL_DISABLE_SSE2) #if (defined(__SSE2__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE2)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#if defined(__SSE3__) && !defined(SDL_DISABLE_SSE3) #if (defined(__SSE3__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE3)
#include <pmmintrin.h> #include <pmmintrin.h>
#endif #endif

View File

@ -194,23 +194,23 @@
#define HAVE_NEON_INTRINSICS 1 #define HAVE_NEON_INTRINSICS 1
#endif #endif
#if defined(__MMX__) && !defined(SDL_DISABLE_MMX) #if (defined(__MMX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_MMX)
#define HAVE_MMX_INTRINSICS 1 #define HAVE_MMX_INTRINSICS 1
#endif #endif
#if defined(__SSE__) && !defined(SDL_DISABLE_SSE) #if (defined(__SSE__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE)
#define HAVE_SSE_INTRINSICS 1 #define HAVE_SSE_INTRINSICS 1
#endif #endif
#if defined(__SSE2__) && !defined(SDL_DISABLE_SSE2) #if (defined(__SSE2__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE2)
#define HAVE_SSE2_INTRINSICS 1 #define HAVE_SSE2_INTRINSICS 1
#endif #endif
#if defined(__SSE3__) && !defined(SDL_DISABLE_SSE3) #if (defined(__SSE3__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE3)
#define HAVE_SSE3_INTRINSICS 1 #define HAVE_SSE3_INTRINSICS 1
#endif #endif
#if defined(__AVX__) && !defined(SDL_DISABLE_AVX) #if (defined(__AVX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_AVX)
#define HAVE_AVX_INTRINSICS 1 #define HAVE_AVX_INTRINSICS 1
#endif #endif
@ -222,19 +222,6 @@
#define HAVE_LASX_INTRINSICS 1 #define HAVE_LASX_INTRINSICS 1
#endif #endif
#if defined __clang__
#if (!__has_attribute(target))
#undef HAVE_AVX_INTRINSICS
#endif
#if (defined(_MSC_VER) || defined(__SCE__)) && !defined(__AVX__)
#undef HAVE_AVX_INTRINSICS
#endif
#elif defined __GNUC__
#if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 9)
#undef HAVE_AVX_INTRINSICS
#endif
#endif
#define SDL_MAIN_NOIMPL /* don't drag in header-only implementation of SDL_main */ #define SDL_MAIN_NOIMPL /* don't drag in header-only implementation of SDL_main */
#include <SDL3/SDL_main.h> #include <SDL3/SDL_main.h>

View File

@ -146,7 +146,7 @@ static int SDL_ConvertAudio(SDL_AudioCVT * cvt);
#if HAVE_SSE3_INTRINSICS #if HAVE_SSE3_INTRINSICS
/* Convert from stereo to mono. Average left and right. */ /* Convert from stereo to mono. Average left and right. */
static void SDLCALL SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse3") SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
const __m128 divby2 = _mm_set1_ps(0.5f); const __m128 divby2 = _mm_set1_ps(0.5f);
float *dst = (float *)cvt->buf; float *dst = (float *)cvt->buf;
@ -183,7 +183,7 @@ static void SDLCALL SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFor
#if HAVE_SSE_INTRINSICS #if HAVE_SSE_INTRINSICS
/* Convert from mono to stereo. Duplicate to stereo left and right. */ /* Convert from mono to stereo. Duplicate to stereo left and right. */
static void SDLCALL SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse") SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
float *dst = ((float *)(cvt->buf + (cvt->len_cvt * 2))) - 8; float *dst = ((float *)(cvt->buf + (cvt->len_cvt * 2))) - 8;
const float *src = ((const float *)(cvt->buf + cvt->len_cvt)) - 4; const float *src = ((const float *)(cvt->buf + cvt->len_cvt)) - 4;

View File

@ -225,7 +225,7 @@ static void SDLCALL SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFo
#endif #endif
#if HAVE_SSE2_INTRINSICS #if HAVE_SSE2_INTRINSICS
static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
const Sint8 *src = ((const Sint8 *)(cvt->buf + cvt->len_cvt)) - 1; const Sint8 *src = ((const Sint8 *)(cvt->buf + cvt->len_cvt)) - 1;
float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1; float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
@ -289,7 +289,7 @@ static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
} }
} }
static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
const Uint8 *src = ((const Uint8 *)(cvt->buf + cvt->len_cvt)) - 1; const Uint8 *src = ((const Uint8 *)(cvt->buf + cvt->len_cvt)) - 1;
float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1; float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
@ -355,7 +355,7 @@ static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
} }
} }
static void SDLCALL SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
const Sint16 *src = ((const Sint16 *)(cvt->buf + cvt->len_cvt)) - 1; const Sint16 *src = ((const Sint16 *)(cvt->buf + cvt->len_cvt)) - 1;
float *dst = ((float *)(cvt->buf + cvt->len_cvt * 2)) - 1; float *dst = ((float *)(cvt->buf + cvt->len_cvt * 2)) - 1;
@ -408,7 +408,7 @@ static void SDLCALL SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
} }
} }
static void SDLCALL SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
const Sint32 *src = (const Sint32 *)cvt->buf; const Sint32 *src = (const Sint32 *)cvt->buf;
float *dst = (float *)cvt->buf; float *dst = (float *)cvt->buf;
@ -451,7 +451,7 @@ static void SDLCALL SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
} }
} }
static void SDLCALL SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
const float *src = (const float *)cvt->buf; const float *src = (const float *)cvt->buf;
Sint8 *dst = (Sint8 *)cvt->buf; Sint8 *dst = (Sint8 *)cvt->buf;
@ -514,7 +514,7 @@ static void SDLCALL SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
} }
} }
static void SDLCALL SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
const float *src = (const float *)cvt->buf; const float *src = (const float *)cvt->buf;
Uint8 *dst = cvt->buf; Uint8 *dst = cvt->buf;
@ -577,7 +577,7 @@ static void SDLCALL SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
} }
} }
static void SDLCALL SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
const float *src = (const float *)cvt->buf; const float *src = (const float *)cvt->buf;
Sint16 *dst = (Sint16 *)cvt->buf; Sint16 *dst = (Sint16 *)cvt->buf;
@ -638,7 +638,7 @@ static void SDLCALL SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
} }
} }
static void SDLCALL SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{ {
const float *src = (const float *)cvt->buf; const float *src = (const float *)cvt->buf;
Sint32 *dst = (Sint32 *)cvt->buf; Sint32 *dst = (Sint32 *)cvt->buf;

View File

@ -169,7 +169,7 @@ static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
#if HAVE_MMX_INTRINSICS #if HAVE_MMX_INTRINSICS
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{ {
int width = info->dst_w; int width = info->dst_w;
int height = info->dst_h; int height = info->dst_h;
@ -223,7 +223,7 @@ static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
} }
/* fast RGB888->(A)RGB888 blending with surface alpha */ /* fast RGB888->(A)RGB888 blending with surface alpha */
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
{ {
SDL_PixelFormat *df = info->dst_fmt; SDL_PixelFormat *df = info->dst_fmt;
Uint32 chanmask; Uint32 chanmask;
@ -318,7 +318,7 @@ static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
} }
/* fast ARGB888->(A)RGB888 blending with pixel alpha */ /* fast ARGB888->(A)RGB888 blending with pixel alpha */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) static void SDL_TARGETING("mmx") BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{ {
int width = info->dst_w; int width = info->dst_w;
int height = info->dst_h; int height = info->dst_h;
@ -753,7 +753,7 @@ static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
#if HAVE_MMX_INTRINSICS #if HAVE_MMX_INTRINSICS
/* fast RGB565->RGB565 blending with surface alpha */ /* fast RGB565->RGB565 blending with surface alpha */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{ {
unsigned alpha = info->a; unsigned alpha = info->a;
if (alpha == 128) { if (alpha == 128) {
@ -889,7 +889,7 @@ static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
} }
/* fast RGB555->RGB555 blending with surface alpha */ /* fast RGB555->RGB555 blending with surface alpha */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{ {
unsigned alpha = info->a; unsigned alpha = info->a;
if (alpha == 128) { if (alpha == 128) {

View File

@ -25,7 +25,7 @@
#if HAVE_SSE_INTRINSICS #if HAVE_SSE_INTRINSICS
/* This assumes 16-byte aligned src and dst */ /* This assumes 16-byte aligned src and dst */
static SDL_INLINE void SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len) static SDL_INLINE void SDL_TARGETING("sse") SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
{ {
int i; int i;
@ -54,7 +54,7 @@ static SDL_INLINE void SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
#ifdef _MSC_VER #ifdef _MSC_VER
#pragma warning(disable : 4799) #pragma warning(disable : 4799)
#endif #endif
static SDL_INLINE void SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len) static SDL_INLINE void SDL_TARGETING("mmx") SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
{ {
const int remain = (len & 63); const int remain = (len & 63);
int i; int i;
@ -81,6 +81,16 @@ static SDL_INLINE void SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
SDL_memcpy(dst + skip, src + skip, remain); SDL_memcpy(dst + skip, src + skip, remain);
} }
} }
/*
 * Copy h rows of w bytes each from src to dst using SDL_memcpyMMX(), then
 * issue _mm_empty() once after the last row.
 *
 * dst, src  - first byte of the destination / source row.
 * w         - number of bytes to copy per row.
 * h         - number of rows to copy.
 * dstskip   - byte distance from one destination row to the next.
 * srcskip   - byte distance from one source row to the next.
 *
 * The parameter order is (dst, src, w, h, dstskip, srcskip) so that it agrees
 * with the call made from SDL_BlitCopy(); the previous declaration listed the
 * skips before w/h, which silently swapped the arguments at the call site.
 */
static SDL_INLINE void SDL_TARGETING("mmx") SDL_BlitCopyMMX(Uint8 *dst, const Uint8 *src, const int w, int h, const int dstskip, const int srcskip)
{
    while (h--) {
        SDL_memcpyMMX(dst, src, w);
        src += srcskip;
        dst += dstskip;
    }
    /* Empty the MMX state once, after all rows have been copied. */
    _mm_empty();
}
#endif /* HAVE_MMX_INTRINSICS */ #endif /* HAVE_MMX_INTRINSICS */
void SDL_BlitCopy(SDL_BlitInfo *info) void SDL_BlitCopy(SDL_BlitInfo *info)
@ -137,12 +147,7 @@ void SDL_BlitCopy(SDL_BlitInfo *info)
#if HAVE_MMX_INTRINSICS #if HAVE_MMX_INTRINSICS
if (SDL_HasMMX() && !(srcskip & 7) && !(dstskip & 7)) { if (SDL_HasMMX() && !(srcskip & 7) && !(dstskip & 7)) {
while (h--) { SDL_BlitCopyMMX(dst, src, w, h, dstskip, srcskip);
SDL_memcpyMMX(dst, src, w);
src += srcskip;
dst += dstskip;
}
_mm_empty();
return; return;
} }
#endif #endif

View File

@ -55,7 +55,7 @@
#define SSE_END #define SSE_END
#define DEFINE_SSE_FILLRECT(bpp, type) \ #define DEFINE_SSE_FILLRECT(bpp, type) \
static void SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \ static void SDL_TARGETING("sse") SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
{ \ { \
int i, n; \ int i, n; \
Uint8 *p = NULL; \ Uint8 *p = NULL; \
@ -92,7 +92,7 @@ static void SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color
SSE_END; \ SSE_END; \
} }
static void SDL_FillSurfaceRect1SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) static void SDL_TARGETING("sse") SDL_FillSurfaceRect1SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
{ {
int i, n; int i, n;

View File

@ -349,7 +349,7 @@ static int scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch,
#if defined(HAVE_SSE2_INTRINSICS) #if defined(HAVE_SSE2_INTRINSICS)
#if 0 #if 0
static void printf_128(const char *str, __m128i var) static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var)
{ {
uint16_t *val = (uint16_t*) &var; uint16_t *val = (uint16_t*) &var;
printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n", printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
@ -367,7 +367,7 @@ static SDL_INLINE int hasSSE2(void)
return val; return val;
} }
static SDL_INLINE void INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero) static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
{ {
__m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */ __m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
__m128i v_frac_w0, k0, l0, d0, e0; __m128i v_frac_w0, k0, l0, d0, e0;
@ -404,7 +404,7 @@ static SDL_INLINE void INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1,
*dst = _mm_cvtsi128_si32(e0); *dst = _mm_cvtsi128_si32(e0);
} }
static int scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch) static int SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{ {
BILINEAR___START BILINEAR___START

View File

@ -303,14 +303,14 @@ static int GetYUVPlanes(int width, int height, Uint32 format, const void *yuv, i
return 0; return 0;
} }
static SDL_bool yuv_rgb_sse( #if HAVE_SSE2_INTRINSICS
static SDL_bool SDL_TARGETING("sse2") yuv_rgb_sse(
Uint32 src_format, Uint32 dst_format, Uint32 src_format, Uint32 dst_format,
Uint32 width, Uint32 height, Uint32 width, Uint32 height,
const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride, const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
Uint8 *rgb, Uint32 rgb_stride, Uint8 *rgb, Uint32 rgb_stride,
YCbCrType yuv_type) YCbCrType yuv_type)
{ {
#if HAVE_SSE2_INTRINSICS
if (!SDL_HasSSE2()) { if (!SDL_HasSSE2()) {
return SDL_FALSE; return SDL_FALSE;
} }
@ -408,10 +408,21 @@ static SDL_bool yuv_rgb_sse(
break; break;
} }
} }
#endif
return SDL_FALSE; return SDL_FALSE;
} }
#else
/* Stand-in compiled when HAVE_SSE2_INTRINSICS is off: no SSE2 conversion is
 * available, so always report "not handled" by returning SDL_FALSE. */
static SDL_bool yuv_rgb_sse(
    Uint32 src_format, Uint32 dst_format,
    Uint32 width, Uint32 height,
    const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
    Uint8 *rgb, Uint32 rgb_stride,
    YCbCrType yuv_type)
{
    /* Every argument is intentionally ignored. */
    (void)src_format;
    (void)dst_format;
    (void)width;
    (void)height;
    (void)y;
    (void)u;
    (void)v;
    (void)y_stride;
    (void)uv_stride;
    (void)rgb;
    (void)rgb_stride;
    (void)yuv_type;

    return SDL_FALSE;
}
#endif
#if HAVE_LSX_INTRINSICS
static SDL_bool yuv_rgb_lsx( static SDL_bool yuv_rgb_lsx(
Uint32 src_format, Uint32 dst_format, Uint32 src_format, Uint32 dst_format,
Uint32 width, Uint32 height, Uint32 width, Uint32 height,
@ -419,7 +430,6 @@ static SDL_bool yuv_rgb_lsx(
Uint8 *rgb, Uint32 rgb_stride, Uint8 *rgb, Uint32 rgb_stride,
YCbCrType yuv_type) YCbCrType yuv_type)
{ {
#if HAVE_LSX_INTRINSICS
if (!SDL_HasLSX()) { if (!SDL_HasLSX()) {
return SDL_FALSE; return SDL_FALSE;
} }
@ -450,9 +460,19 @@ static SDL_bool yuv_rgb_lsx(
break; break;
} }
} }
#endif
return SDL_FALSE; return SDL_FALSE;
} }
#else
/* Stand-in compiled when HAVE_LSX_INTRINSICS is off: no LSX conversion is
 * available, so always report "not handled" by returning SDL_FALSE. */
static SDL_bool yuv_rgb_lsx(
    Uint32 src_format, Uint32 dst_format,
    Uint32 width, Uint32 height,
    const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
    Uint8 *rgb, Uint32 rgb_stride,
    YCbCrType yuv_type)
{
    /* Every argument is intentionally ignored. */
    (void)src_format;
    (void)dst_format;
    (void)width;
    (void)height;
    (void)y;
    (void)u;
    (void)v;
    (void)y_stride;
    (void)uv_stride;
    (void)rgb;
    (void)rgb_stride;
    (void)yuv_type;

    return SDL_FALSE;
}
#endif
static SDL_bool yuv_rgb_std( static SDL_bool yuv_rgb_std(
Uint32 src_format, Uint32 dst_format, Uint32 src_format, Uint32 dst_format,
@ -1102,7 +1122,8 @@ static int SDL_ConvertPixels_SwapUVPlanes(int width, int height, const void *src
return 0; return 0;
} }
static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV) #if HAVE_SSE2_INTRINSICS
static int SDL_TARGETING("sse2") SDL_ConvertPixels_PackUVPlanes_to_NV_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
{ {
int x, y; int x, y;
const int UVwidth = (width + 1) / 2; const int UVwidth = (width + 1) / 2;
@ -1114,9 +1135,6 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi
const Uint8 *src1, *src2; const Uint8 *src1, *src2;
Uint8 *dstUV; Uint8 *dstUV;
Uint8 *tmp = NULL; Uint8 *tmp = NULL;
#if HAVE_SSE2_INTRINSICS
const SDL_bool use_SSE2 = SDL_HasSSE2();
#endif
/* Skip the Y plane */ /* Skip the Y plane */
src = (const Uint8 *)src + height * src_pitch; src = (const Uint8 *)src + height * src_pitch;
@ -1144,22 +1162,18 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi
y = UVheight; y = UVheight;
while (y--) { while (y--) {
x = UVwidth; x = UVwidth;
#if HAVE_SSE2_INTRINSICS while (x >= 16) {
if (use_SSE2) { __m128i u = _mm_loadu_si128((__m128i *)src1);
while (x >= 16) { __m128i v = _mm_loadu_si128((__m128i *)src2);
__m128i u = _mm_loadu_si128((__m128i *)src1); __m128i uv1 = _mm_unpacklo_epi8(u, v);
__m128i v = _mm_loadu_si128((__m128i *)src2); __m128i uv2 = _mm_unpackhi_epi8(u, v);
__m128i uv1 = _mm_unpacklo_epi8(u, v); _mm_storeu_si128((__m128i *)dstUV, uv1);
__m128i uv2 = _mm_unpackhi_epi8(u, v); _mm_storeu_si128((__m128i *)(dstUV + 16), uv2);
_mm_storeu_si128((__m128i *)dstUV, uv1); src1 += 16;
_mm_storeu_si128((__m128i *)(dstUV + 16), uv2); src2 += 16;
src1 += 16; dstUV += 32;
src2 += 16; x -= 16;
dstUV += 32;
x -= 16;
}
} }
#endif
while (x--) { while (x--) {
*dstUV++ = *src1++; *dstUV++ = *src1++;
*dstUV++ = *src2++; *dstUV++ = *src2++;
@ -1174,8 +1188,68 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi
} }
return 0; return 0;
} }
#endif
/*
 * Interleave the two separate chroma planes that follow the Y plane into one
 * plane of byte pairs.
 *
 * width, height   - image size in pixels; each chroma plane holds
 *                   ((width + 1) / 2) x ((height + 1) / 2) samples.
 * src, src_pitch  - source image and the pitch (bytes) of its Y plane.
 * dst, dst_pitch  - destination image and the pitch (bytes) of its Y plane.
 * reverseUV       - when true, the second source plane supplies the first
 *                   byte of every output pair instead of the second.
 *
 * Returns 0 on success, or the value of SDL_OutOfMemory() when the scratch
 * copy needed for an in-place conversion (src == dst after skipping Y) cannot
 * be allocated.
 */
static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
{
    int x, y;
    const int UVwidth = (width + 1) / 2;
    const int UVheight = (height + 1) / 2;
    const int srcUVPitch = ((src_pitch + 1) / 2);
    const int srcUVPitchLeft = srcUVPitch - UVwidth;
    const int dstUVPitch = ((dst_pitch + 1) / 2) * 2;
    const int dstUVPitchLeft = dstUVPitch - UVwidth * 2;
    const Uint8 *src1, *src2;
    Uint8 *dstUV;
    Uint8 *tmp = NULL;

#if HAVE_SSE2_INTRINSICS
    /* The _SSE2 variant is only compiled when SSE2 intrinsics are available,
       so the runtime dispatch must sit behind the same guard. */
    if (SDL_HasSSE2()) {
        return SDL_ConvertPixels_PackUVPlanes_to_NV_SSE2(width, height, src, src_pitch, dst, dst_pitch, reverseUV);
    }
#endif

    /* Skip the Y plane */
    src = (const Uint8 *)src + height * src_pitch;
    dst = (Uint8 *)dst + height * dst_pitch;

    if (src == dst) {
        /* Need to make a copy of the buffer so we don't clobber it while converting */
        tmp = (Uint8 *)SDL_malloc((size_t)2 * UVheight * srcUVPitch);
        if (tmp == NULL) {
            return SDL_OutOfMemory();
        }
        SDL_memcpy(tmp, src, (size_t)2 * UVheight * srcUVPitch);
        src = tmp;
    }

    /* src1 feeds the first byte of each output pair, src2 the second. */
    if (reverseUV) {
        src2 = (const Uint8 *)src;
        src1 = src2 + UVheight * srcUVPitch;
    } else {
        src1 = (const Uint8 *)src;
        src2 = src1 + UVheight * srcUVPitch;
    }
    dstUV = (Uint8 *)dst;

    y = UVheight;
    while (y--) {
        x = UVwidth;
        while (x--) {
            *dstUV++ = *src1++;
            *dstUV++ = *src2++;
        }
        /* Step over the row padding of each plane. */
        src1 += srcUVPitchLeft;
        src2 += srcUVPitchLeft;
        dstUV += dstUVPitchLeft;
    }

    if (tmp) {
        SDL_free(tmp);
    }
    return 0;
}
#if HAVE_SSE2_INTRINSICS
static int SDL_TARGETING("sse2") SDL_ConvertPixels_SplitNV_to_UVPlanes_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
{ {
int x, y; int x, y;
const int UVwidth = (width + 1) / 2; const int UVwidth = (width + 1) / 2;
@ -1187,10 +1261,82 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo
const Uint8 *srcUV; const Uint8 *srcUV;
Uint8 *dst1, *dst2; Uint8 *dst1, *dst2;
Uint8 *tmp = NULL; Uint8 *tmp = NULL;
#if HAVE_SSE2_INTRINSICS
const SDL_bool use_SSE2 = SDL_HasSSE2(); /* Skip the Y plane */
src = (const Uint8 *)src + height * src_pitch;
dst = (Uint8 *)dst + height * dst_pitch;
if (src == dst) {
/* Need to make a copy of the buffer so we don't clobber it while converting */
tmp = (Uint8 *)SDL_malloc((size_t)UVheight * srcUVPitch);
if (tmp == NULL) {
return SDL_OutOfMemory();
}
SDL_memcpy(tmp, src, (size_t)UVheight * srcUVPitch);
src = tmp;
}
if (reverseUV) {
dst2 = (Uint8 *)dst;
dst1 = dst2 + UVheight * dstUVPitch;
} else {
dst1 = (Uint8 *)dst;
dst2 = dst1 + UVheight * dstUVPitch;
}
srcUV = (const Uint8 *)src;
y = UVheight;
while (y--) {
__m128i mask = _mm_set1_epi16(0x00FF);
x = UVwidth;
while (x >= 16) {
__m128i uv1 = _mm_loadu_si128((__m128i *)srcUV);
__m128i uv2 = _mm_loadu_si128((__m128i *)(srcUV + 16));
__m128i u1 = _mm_and_si128(uv1, mask);
__m128i u2 = _mm_and_si128(uv2, mask);
__m128i u = _mm_packus_epi16(u1, u2);
__m128i v1 = _mm_srli_epi16(uv1, 8);
__m128i v2 = _mm_srli_epi16(uv2, 8);
__m128i v = _mm_packus_epi16(v1, v2);
_mm_storeu_si128((__m128i *)dst1, u);
_mm_storeu_si128((__m128i *)dst2, v);
srcUV += 32;
dst1 += 16;
dst2 += 16;
x -= 16;
}
while (x--) {
*dst1++ = *srcUV++;
*dst2++ = *srcUV++;
}
srcUV += srcUVPitchLeft;
dst1 += dstUVPitchLeft;
dst2 += dstUVPitchLeft;
}
if (tmp) {
SDL_free(tmp);
}
return 0;
}
#endif #endif
static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
{
if (SDL_HasSSE2()) {
return SDL_ConvertPixels_SplitNV_to_UVPlanes_SSE2(width, height, src, src_pitch, dst, dst_pitch, reverseUV);
} else {
int x, y;
const int UVwidth = (width + 1) / 2;
const int UVheight = (height + 1) / 2;
const int srcUVPitch = ((src_pitch + 1) / 2) * 2;
const int srcUVPitchLeft = srcUVPitch - UVwidth * 2;
const int dstUVPitch = ((dst_pitch + 1) / 2);
const int dstUVPitchLeft = dstUVPitch - UVwidth;
const Uint8 *srcUV;
Uint8 *dst1, *dst2;
Uint8 *tmp = NULL;
/* Skip the Y plane */ /* Skip the Y plane */
src = (const Uint8 *)src + height * src_pitch; src = (const Uint8 *)src + height * src_pitch;
dst = (Uint8 *)dst + height * dst_pitch; dst = (Uint8 *)dst + height * dst_pitch;
@ -1217,27 +1363,6 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo
y = UVheight; y = UVheight;
while (y--) { while (y--) {
x = UVwidth; x = UVwidth;
#if HAVE_SSE2_INTRINSICS
if (use_SSE2) {
__m128i mask = _mm_set1_epi16(0x00FF);
while (x >= 16) {
__m128i uv1 = _mm_loadu_si128((__m128i *)srcUV);
__m128i uv2 = _mm_loadu_si128((__m128i *)(srcUV + 16));
__m128i u1 = _mm_and_si128(uv1, mask);
__m128i u2 = _mm_and_si128(uv2, mask);
__m128i u = _mm_packus_epi16(u1, u2);
__m128i v1 = _mm_srli_epi16(uv1, 8);
__m128i v2 = _mm_srli_epi16(uv2, 8);
__m128i v = _mm_packus_epi16(v1, v2);
_mm_storeu_si128((__m128i *)dst1, u);
_mm_storeu_si128((__m128i *)dst2, v);
srcUV += 32;
dst1 += 16;
dst2 += 16;
x -= 16;
}
}
#endif
while (x--) { while (x--) {
*dst1++ = *srcUV++; *dst1++ = *srcUV++;
*dst2++ = *srcUV++; *dst2++ = *srcUV++;
@ -1251,9 +1376,11 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo
SDL_free(tmp); SDL_free(tmp);
} }
return 0; return 0;
}
} }
static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) #if HAVE_SSE2_INTRINSICS
static int SDL_TARGETING("sse2") SDL_ConvertPixels_SwapNV_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{ {
int x, y; int x, y;
const int UVwidth = (width + 1) / 2; const int UVwidth = (width + 1) / 2;
@ -1264,9 +1391,6 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int
const int dstUVPitchLeft = (dstUVPitch - UVwidth * 2) / sizeof(Uint16); const int dstUVPitchLeft = (dstUVPitch - UVwidth * 2) / sizeof(Uint16);
const Uint16 *srcUV; const Uint16 *srcUV;
Uint16 *dstUV; Uint16 *dstUV;
#if HAVE_SSE2_INTRINSICS
const SDL_bool use_SSE2 = SDL_HasSSE2();
#endif
/* Skip the Y plane */ /* Skip the Y plane */
src = (const Uint8 *)src + height * src_pitch; src = (const Uint8 *)src + height * src_pitch;
@ -1277,20 +1401,16 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int
y = UVheight; y = UVheight;
while (y--) { while (y--) {
x = UVwidth; x = UVwidth;
#if HAVE_SSE2_INTRINSICS while (x >= 8) {
if (use_SSE2) { __m128i uv = _mm_loadu_si128((__m128i *)srcUV);
while (x >= 8) { __m128i v = _mm_slli_epi16(uv, 8);
__m128i uv = _mm_loadu_si128((__m128i *)srcUV); __m128i u = _mm_srli_epi16(uv, 8);
__m128i v = _mm_slli_epi16(uv, 8); __m128i vu = _mm_or_si128(v, u);
__m128i u = _mm_srli_epi16(uv, 8); _mm_storeu_si128((__m128i *)dstUV, vu);
__m128i vu = _mm_or_si128(v, u); srcUV += 8;
_mm_storeu_si128((__m128i *)dstUV, vu); dstUV += 8;
srcUV += 8; x -= 8;
dstUV += 8;
x -= 8;
}
} }
#endif
while (x--) { while (x--) {
*dstUV++ = SDL_Swap16(*srcUV++); *dstUV++ = SDL_Swap16(*srcUV++);
} }
@ -1299,6 +1419,41 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int
} }
return 0; return 0;
} }
#endif
/*
 * Copy the interleaved chroma plane that follows the Y plane from src to dst,
 * swapping the two bytes of every 16-bit pair on the way.
 *
 * width, height   - image size in pixels; the chroma plane holds
 *                   ((width + 1) / 2) pairs per row and ((height + 1) / 2) rows.
 * src, src_pitch  - source image and the pitch (bytes) of its Y plane.
 * dst, dst_pitch  - destination image and the pitch (bytes) of its Y plane.
 *
 * Always returns 0.
 */
static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{
    int x, y;
    const int UVwidth = (width + 1) / 2;
    const int UVheight = (height + 1) / 2;
    const int srcUVPitch = ((src_pitch + 1) / 2) * 2;
    const int srcUVPitchLeft = (srcUVPitch - UVwidth * 2) / sizeof(Uint16);
    const int dstUVPitch = ((dst_pitch + 1) / 2) * 2;
    const int dstUVPitchLeft = (dstUVPitch - UVwidth * 2) / sizeof(Uint16);
    const Uint16 *srcUV;
    Uint16 *dstUV;

#if HAVE_SSE2_INTRINSICS
    /* The _SSE2 variant is only compiled when SSE2 intrinsics are available,
       so the runtime dispatch must sit behind the same guard. */
    if (SDL_HasSSE2()) {
        return SDL_ConvertPixels_SwapNV_SSE2(width, height, src, src_pitch, dst, dst_pitch);
    }
#endif

    /* Skip the Y plane */
    src = (const Uint8 *)src + height * src_pitch;
    dst = (Uint8 *)dst + height * dst_pitch;

    srcUV = (const Uint16 *)src;
    dstUV = (Uint16 *)dst;
    y = UVheight;
    while (y--) {
        x = UVwidth;
        while (x--) {
            *dstUV++ = SDL_Swap16(*srcUV++);
        }
        /* Step over the row padding, counted in 16-bit units. */
        srcUV += srcUVPitchLeft;
        dstUV += dstUVPitchLeft;
    }
    return 0;
}
static int SDL_ConvertPixels_Planar2x2_to_Planar2x2(int width, int height, static int SDL_ConvertPixels_Planar2x2_to_Planar2x2(int width, int height,
Uint32 src_format, const void *src, int src_pitch, Uint32 src_format, const void *src, int src_pitch,
@ -1389,28 +1544,232 @@ static int SDL_ConvertPixels_Planar2x2_to_Planar2x2(int width, int height,
x -= 4; \ x -= 4; \
} }
/* Repack every 4-byte group of a packed image from Y1 U Y2 V order into
 * U Y1 V Y2 order, row by row, compiled for the "sse2" target.
 * src_pitch / dst_pitch are the byte pitches of the source / destination rows.
 * Always returns 0. */
static int SDL_TARGETING("sse2") SDL_ConvertPixels_YUY2_to_UYVY_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{
int x, y;
/* Number of 4-byte groups per row: ceil(width / 2). */
const int YUVwidth = (width + 1) / 2;
/* Bytes remaining in each row after the groups; skipped at the end of a row. */
const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst;
y = height;
x = YUVwidth;
while (y--) {
/* NOTE(review): PACKED4_TO_PACKED4_ROW_SSE2 is defined outside this view;
   presumably it converts groups in bulk using this shuffle and decrements x,
   leaving the remainder for the scalar loop below - confirm. */
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1));
/* Scalar path for the groups still counted in x. */
while (x--) {
Uint8 Y1, U, Y2, V;
Y1 = srcYUV[0];
U = srcYUV[1];
Y2 = srcYUV[2];
V = srcYUV[3];
srcYUV += 4;
dstYUV[0] = U;
dstYUV[1] = Y1;
dstYUV[2] = V;
dstYUV[3] = Y2;
dstYUV += 4;
}
srcYUV += srcYUVPitchLeft;
dstYUV += dstYUVPitchLeft;
/* Re-arm the group counter for the next row. */
x = YUVwidth;
}
return 0;
}
/* Repack every 4-byte group of a packed image from Y1 U Y2 V order into
 * Y1 V Y2 U order, row by row, compiled for the "sse2" target.
 * src_pitch / dst_pitch are the byte pitches of the source / destination rows.
 * Always returns 0. */
static int SDL_TARGETING("sse2") SDL_ConvertPixels_YUY2_to_YVYU_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{
int x, y;
/* Number of 4-byte groups per row: ceil(width / 2). */
const int YUVwidth = (width + 1) / 2;
/* Bytes remaining in each row after the groups; skipped at the end of a row. */
const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst;
y = height;
x = YUVwidth;
while (y--) {
/* NOTE(review): PACKED4_TO_PACKED4_ROW_SSE2 is defined outside this view;
   presumably it converts groups in bulk using this shuffle and decrements x,
   leaving the remainder for the scalar loop below - confirm. */
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(1, 2, 3, 0));
/* Scalar path for the groups still counted in x. */
while (x--) {
Uint8 Y1, U, Y2, V;
Y1 = srcYUV[0];
U = srcYUV[1];
Y2 = srcYUV[2];
V = srcYUV[3];
srcYUV += 4;
dstYUV[0] = Y1;
dstYUV[1] = V;
dstYUV[2] = Y2;
dstYUV[3] = U;
dstYUV += 4;
}
srcYUV += srcYUVPitchLeft;
dstYUV += dstYUVPitchLeft;
/* Re-arm the group counter for the next row. */
x = YUVwidth;
}
return 0;
}
/* Repack every 4-byte group of a packed image from U Y1 V Y2 order into
 * Y1 U Y2 V order, row by row, compiled for the "sse2" target.
 * src_pitch / dst_pitch are the byte pitches of the source / destination rows.
 * Always returns 0. */
static int SDL_TARGETING("sse2") SDL_ConvertPixels_UYVY_to_YUY2_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{
int x, y;
/* Number of 4-byte groups per row: ceil(width / 2). */
const int YUVwidth = (width + 1) / 2;
/* Bytes remaining in each row after the groups; skipped at the end of a row. */
const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst;
y = height;
x = YUVwidth;
while (y--) {
/* NOTE(review): PACKED4_TO_PACKED4_ROW_SSE2 is defined outside this view;
   presumably it converts groups in bulk using this shuffle and decrements x,
   leaving the remainder for the scalar loop below - confirm. */
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1));
/* Scalar path for the groups still counted in x. */
while (x--) {
Uint8 Y1, U, Y2, V;
U = srcYUV[0];
Y1 = srcYUV[1];
V = srcYUV[2];
Y2 = srcYUV[3];
srcYUV += 4;
dstYUV[0] = Y1;
dstYUV[1] = U;
dstYUV[2] = Y2;
dstYUV[3] = V;
dstYUV += 4;
}
srcYUV += srcYUVPitchLeft;
dstYUV += dstYUVPitchLeft;
/* Re-arm the group counter for the next row. */
x = YUVwidth;
}
return 0;
}
/*
 * SSE2-targeted conversion of packed UYVY rows into packed YVYU rows.
 *
 * A row is processed as (width + 1) / 2 four-byte groups.  Within a group
 * the leading U byte moves to the end and the other three bytes shift one
 * position forward:  U Y1 V Y2  ->  Y1 V Y2 U.
 *
 * src_pitch and dst_pitch are the byte distances between the starts of
 * consecutive rows.  The function always returns 0.
 */
static int SDL_TARGETING("sse2") SDL_ConvertPixels_UYVY_to_YVYU_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{
    const int groupsPerRow = (width + 1) / 2;
    /* Bytes to skip after the last group of a row to reach the next row. */
    const int srcRowSkip = src_pitch - groupsPerRow * 4;
    const int dstRowSkip = dst_pitch - groupsPerRow * 4;
    /* NOTE(review): x, srcYUV and dstYUV keep these exact names because
     * PACKED4_TO_PACKED4_ROW_SSE2 (defined elsewhere) receives only the
     * shuffle constant and presumably works on them directly -- confirm
     * against the macro before renaming. */
    const Uint8 *srcYUV = (const Uint8 *)src;
    Uint8 *dstYUV = (Uint8 *)dst;
    int x;
    int y;

    for (y = height; y != 0; --y) {
        x = groupsPerRow;
        PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(0, 3, 2, 1));
        /* Scalar pass over whatever groups are still counted in x. */
        for (; x != 0; --x) {
            /* Read the whole group before writing any byte of it. */
            const Uint8 chromaU = srcYUV[0];
            const Uint8 luma0 = srcYUV[1];
            const Uint8 chromaV = srcYUV[2];
            const Uint8 luma1 = srcYUV[3];

            dstYUV[0] = luma0;
            dstYUV[1] = chromaV;
            dstYUV[2] = luma1;
            dstYUV[3] = chromaU;

            srcYUV += 4;
            dstYUV += 4;
        }
        srcYUV += srcRowSkip;
        dstYUV += dstRowSkip;
    }
    return 0;
}
/*
 * SSE2-targeted conversion of packed YVYU rows into packed YUY2 rows.
 *
 * A row is processed as (width + 1) / 2 four-byte groups.  Within a group
 * the two luma bytes stay where they are and the two chroma bytes trade
 * places:  Y1 V Y2 U  ->  Y1 U Y2 V.
 *
 * src_pitch and dst_pitch are the byte distances between the starts of
 * consecutive rows.  The function always returns 0.
 */
static int SDL_TARGETING("sse2") SDL_ConvertPixels_YVYU_to_YUY2_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{
    const int groupsPerRow = (width + 1) / 2;
    /* Bytes to skip after the last group of a row to reach the next row. */
    const int srcRowSkip = src_pitch - groupsPerRow * 4;
    const int dstRowSkip = dst_pitch - groupsPerRow * 4;
    /* NOTE(review): x, srcYUV and dstYUV keep these exact names because
     * PACKED4_TO_PACKED4_ROW_SSE2 (defined elsewhere) receives only the
     * shuffle constant and presumably works on them directly -- confirm
     * against the macro before renaming. */
    const Uint8 *srcYUV = (const Uint8 *)src;
    Uint8 *dstYUV = (Uint8 *)dst;
    int x;
    int y;

    for (y = height; y != 0; --y) {
        x = groupsPerRow;
        PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(1, 2, 3, 0));
        /* Scalar pass over whatever groups are still counted in x. */
        for (; x != 0; --x) {
            /* Read the whole group before writing any byte of it. */
            const Uint8 luma0 = srcYUV[0];
            const Uint8 chromaV = srcYUV[1];
            const Uint8 luma1 = srcYUV[2];
            const Uint8 chromaU = srcYUV[3];

            dstYUV[0] = luma0;
            dstYUV[1] = chromaU;
            dstYUV[2] = luma1;
            dstYUV[3] = chromaV;

            srcYUV += 4;
            dstYUV += 4;
        }
        srcYUV += srcRowSkip;
        dstYUV += dstRowSkip;
    }
    return 0;
}
/*
 * SSE2-targeted conversion of packed YVYU rows into packed UYVY rows.
 *
 * A row is processed as (width + 1) / 2 four-byte groups.  Within a group
 * the trailing U byte moves to the front and the other three bytes shift
 * one position back:  Y1 V Y2 U  ->  U Y1 V Y2.
 *
 * src_pitch and dst_pitch are the byte distances between the starts of
 * consecutive rows.  The function always returns 0.
 */
static int SDL_TARGETING("sse2") SDL_ConvertPixels_YVYU_to_UYVY_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{
    const int groupsPerRow = (width + 1) / 2;
    /* Bytes to skip after the last group of a row to reach the next row. */
    const int srcRowSkip = src_pitch - groupsPerRow * 4;
    const int dstRowSkip = dst_pitch - groupsPerRow * 4;
    /* NOTE(review): x, srcYUV and dstYUV keep these exact names because
     * PACKED4_TO_PACKED4_ROW_SSE2 (defined elsewhere) receives only the
     * shuffle constant and presumably works on them directly -- confirm
     * against the macro before renaming. */
    const Uint8 *srcYUV = (const Uint8 *)src;
    Uint8 *dstYUV = (Uint8 *)dst;
    int x;
    int y;

    for (y = height; y != 0; --y) {
        x = groupsPerRow;
        PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 1, 0, 3));
        /* Scalar pass over whatever groups are still counted in x. */
        for (; x != 0; --x) {
            /* Read the whole group before writing any byte of it. */
            const Uint8 luma0 = srcYUV[0];
            const Uint8 chromaV = srcYUV[1];
            const Uint8 luma1 = srcYUV[2];
            const Uint8 chromaU = srcYUV[3];

            dstYUV[0] = chromaU;
            dstYUV[1] = luma0;
            dstYUV[2] = chromaV;
            dstYUV[3] = luma1;

            srcYUV += 4;
            dstYUV += 4;
        }
        srcYUV += srcRowSkip;
        dstYUV += dstRowSkip;
    }
    return 0;
}
#endif #endif
static int SDL_ConvertPixels_YUY2_to_UYVY(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) static int SDL_ConvertPixels_YUY2_to_UYVY(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{ {
if (SDL_HasSSE2()) {
return SDL_ConvertPixels_YUY2_to_UYVY_SSE2(width, height, src, src_pitch, dst, dst_pitch);
} else {
int x, y; int x, y;
const int YUVwidth = (width + 1) / 2; const int YUVwidth = (width + 1) / 2;
const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src; const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst; Uint8 *dstYUV = (Uint8 *)dst;
#if HAVE_SSE2_INTRINSICS
const SDL_bool use_SSE2 = SDL_HasSSE2();
#endif
y = height; y = height;
while (y--) { while (y--) {
x = YUVwidth; x = YUVwidth;
#if HAVE_SSE2_INTRINSICS
if (use_SSE2) {
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1));
}
#endif
while (x--) { while (x--) {
Uint8 Y1, U, Y2, V; Uint8 Y1, U, Y2, V;
@ -1430,28 +1789,24 @@ static int SDL_ConvertPixels_YUY2_to_UYVY(int width, int height, const void *src
dstYUV += dstYUVPitchLeft; dstYUV += dstYUVPitchLeft;
} }
return 0; return 0;
}
} }
static int SDL_ConvertPixels_YUY2_to_YVYU(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) static int SDL_ConvertPixels_YUY2_to_YVYU(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{ {
if (SDL_HasSSE2()) {
return SDL_ConvertPixels_YUY2_to_YVYU_SSE2(width, height, src, src_pitch, dst, dst_pitch);
} else {
int x, y; int x, y;
const int YUVwidth = (width + 1) / 2; const int YUVwidth = (width + 1) / 2;
const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src; const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst; Uint8 *dstYUV = (Uint8 *)dst;
#if HAVE_SSE2_INTRINSICS
const SDL_bool use_SSE2 = SDL_HasSSE2();
#endif
y = height; y = height;
while (y--) { while (y--) {
x = YUVwidth; x = YUVwidth;
#if HAVE_SSE2_INTRINSICS
if (use_SSE2) {
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(1, 2, 3, 0));
}
#endif
while (x--) { while (x--) {
Uint8 Y1, U, Y2, V; Uint8 Y1, U, Y2, V;
@ -1471,28 +1826,24 @@ static int SDL_ConvertPixels_YUY2_to_YVYU(int width, int height, const void *src
dstYUV += dstYUVPitchLeft; dstYUV += dstYUVPitchLeft;
} }
return 0; return 0;
}
} }
static int SDL_ConvertPixels_UYVY_to_YUY2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) static int SDL_ConvertPixels_UYVY_to_YUY2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{ {
if (SDL_HasSSE2()) {
return SDL_ConvertPixels_UYVY_to_YUY2_SSE2(width, height, src, src_pitch, dst, dst_pitch);
} else {
int x, y; int x, y;
const int YUVwidth = (width + 1) / 2; const int YUVwidth = (width + 1) / 2;
const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src; const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst; Uint8 *dstYUV = (Uint8 *)dst;
#if HAVE_SSE2_INTRINSICS
const SDL_bool use_SSE2 = SDL_HasSSE2();
#endif
y = height; y = height;
while (y--) { while (y--) {
x = YUVwidth; x = YUVwidth;
#if HAVE_SSE2_INTRINSICS
if (use_SSE2) {
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1));
}
#endif
while (x--) { while (x--) {
Uint8 Y1, U, Y2, V; Uint8 Y1, U, Y2, V;
@ -1512,28 +1863,24 @@ static int SDL_ConvertPixels_UYVY_to_YUY2(int width, int height, const void *src
dstYUV += dstYUVPitchLeft; dstYUV += dstYUVPitchLeft;
} }
return 0; return 0;
}
} }
static int SDL_ConvertPixels_UYVY_to_YVYU(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) static int SDL_ConvertPixels_UYVY_to_YVYU(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{ {
if (SDL_HasSSE2()) {
return SDL_ConvertPixels_UYVY_to_YVYU_SSE2(width, height, src, src_pitch, dst, dst_pitch);
} else {
int x, y; int x, y;
const int YUVwidth = (width + 1) / 2; const int YUVwidth = (width + 1) / 2;
const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src; const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst; Uint8 *dstYUV = (Uint8 *)dst;
#if HAVE_SSE2_INTRINSICS
const SDL_bool use_SSE2 = SDL_HasSSE2();
#endif
y = height; y = height;
while (y--) { while (y--) {
x = YUVwidth; x = YUVwidth;
#if HAVE_SSE2_INTRINSICS
if (use_SSE2) {
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(0, 3, 2, 1));
}
#endif
while (x--) { while (x--) {
Uint8 Y1, U, Y2, V; Uint8 Y1, U, Y2, V;
@ -1553,28 +1900,24 @@ static int SDL_ConvertPixels_UYVY_to_YVYU(int width, int height, const void *src
dstYUV += dstYUVPitchLeft; dstYUV += dstYUVPitchLeft;
} }
return 0; return 0;
}
} }
static int SDL_ConvertPixels_YVYU_to_YUY2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) static int SDL_ConvertPixels_YVYU_to_YUY2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{ {
if (SDL_HasSSE2()) {
return SDL_ConvertPixels_YVYU_to_YUY2_SSE2(width, height, src, src_pitch, dst, dst_pitch);
} else {
int x, y; int x, y;
const int YUVwidth = (width + 1) / 2; const int YUVwidth = (width + 1) / 2;
const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src; const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst; Uint8 *dstYUV = (Uint8 *)dst;
#if HAVE_SSE2_INTRINSICS
const SDL_bool use_SSE2 = SDL_HasSSE2();
#endif
y = height; y = height;
while (y--) { while (y--) {
x = YUVwidth; x = YUVwidth;
#if HAVE_SSE2_INTRINSICS
if (use_SSE2) {
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(1, 2, 3, 0));
}
#endif
while (x--) { while (x--) {
Uint8 Y1, U, Y2, V; Uint8 Y1, U, Y2, V;
@ -1594,28 +1937,24 @@ static int SDL_ConvertPixels_YVYU_to_YUY2(int width, int height, const void *src
dstYUV += dstYUVPitchLeft; dstYUV += dstYUVPitchLeft;
} }
return 0; return 0;
}
} }
static int SDL_ConvertPixels_YVYU_to_UYVY(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch) static int SDL_ConvertPixels_YVYU_to_UYVY(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{ {
if (SDL_HasSSE2()) {
return SDL_ConvertPixels_YVYU_to_UYVY_SSE2(width, height, src, src_pitch, dst, dst_pitch);
} else {
int x, y; int x, y;
const int YUVwidth = (width + 1) / 2; const int YUVwidth = (width + 1) / 2;
const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src; const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst; Uint8 *dstYUV = (Uint8 *)dst;
#if HAVE_SSE2_INTRINSICS
const SDL_bool use_SSE2 = SDL_HasSSE2();
#endif
y = height; y = height;
while (y--) { while (y--) {
x = YUVwidth; x = YUVwidth;
#if HAVE_SSE2_INTRINSICS
if (use_SSE2) {
PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 1, 0, 3));
}
#endif
while (x--) { while (x--) {
Uint8 Y1, U, Y2, V; Uint8 Y1, U, Y2, V;
@ -1635,6 +1974,7 @@ static int SDL_ConvertPixels_YVYU_to_UYVY(int width, int height, const void *src
dstYUV += dstYUVPitchLeft; dstYUV += dstYUVPitchLeft;
} }
return 0; return 0;
}
} }
static int SDL_ConvertPixels_Packed4_to_Packed4(int width, int height, static int SDL_ConvertPixels_Packed4_to_Packed4(int width, int height,

View File

@ -609,7 +609,7 @@ V = _mm_srai_epi16(V, PRECISION);
SAVE_SI128((__m128i*)(u_ptr), u1); \ SAVE_SI128((__m128i*)(u_ptr), u1); \
SAVE_SI128((__m128i*)(v_ptr), v1); SAVE_SI128((__m128i*)(v_ptr), v1);
void rgb24_yuv420_sse(uint32_t width, uint32_t height, void SDL_TARGETING("sse2") rgb24_yuv420_sse(uint32_t width, uint32_t height,
const uint8_t *RGB, uint32_t RGB_stride, const uint8_t *RGB, uint32_t RGB_stride,
uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
YCbCrType yuv_type) YCbCrType yuv_type)
@ -645,7 +645,7 @@ void rgb24_yuv420_sse(uint32_t width, uint32_t height,
#undef SAVE_SI128 #undef SAVE_SI128
} }
void rgb24_yuv420_sseu(uint32_t width, uint32_t height, void SDL_TARGETING("sse2") rgb24_yuv420_sseu(uint32_t width, uint32_t height,
const uint8_t *RGB, uint32_t RGB_stride, const uint8_t *RGB, uint32_t RGB_stride,
uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
YCbCrType yuv_type) YCbCrType yuv_type)

View File

@ -382,7 +382,7 @@ PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6)
\ \
void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, void SDL_TARGETING("sse2") SSE_FUNCTION_NAME(uint32_t width, uint32_t height,
const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
uint8_t *RGB, uint32_t RGB_stride, uint8_t *RGB, uint32_t RGB_stride,
YCbCrType yuv_type) YCbCrType yuv_type)