Fixed alpha blending for the MMX blit functions
I see the Remarks of function SDL_BlitSurface shows that "when SDL_BLENDMODE_BLEND, we have dstA = srcA + (dstA * (1-srcA))". however, I tested some pictures but the result implies "dstA=arcA" actually. I stepped into the source code, and found after I set SDL_BLENDMODE_BLEND for the source surface, the final blit function is BlitRGBtoRGBPixelAlphaMMX when I use SDL_BlitSurface on my computer. And I found these codes: else if (alpha == amask) { /* opaque alpha -- copy RGB, keep dst alpha */ *dstp = (*srcp & chanmask) | (*dstp & ~chanmask); The same code is used in BlitRGBtoRGBPixelAlphaMMX3DNOW and BlitRGBtoRGBPixelAlpha. So I think they still keep dst alpha. Best regards, Jianyu Guan
parent
65728477af
commit
89bc80f1ae
|
@ -337,15 +337,14 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
|
|||
Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
|
||||
Uint32 amask = sf->Amask;
|
||||
Uint32 ashift = sf->Ashift;
|
||||
Uint64 multmask;
|
||||
Uint64 multmask, multmask2;
|
||||
|
||||
__m64 src1, dst1, mm_alpha, mm_zero, dmask;
|
||||
__m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
|
||||
|
||||
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
|
||||
multmask = 0xFFFF;
|
||||
multmask <<= (ashift * 2);
|
||||
multmask = ~multmask;
|
||||
dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
|
||||
multmask = 0x00FF;
|
||||
multmask <<= (ashift * 2);
|
||||
multmask2 = 0x00FF00FF00FF00FF;
|
||||
|
||||
while (height--) {
|
||||
/* *INDENT-OFF* */
|
||||
|
@ -353,9 +352,8 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
|
|||
Uint32 alpha = *srcp & amask;
|
||||
if (alpha == 0) {
|
||||
/* do nothing */
|
||||
} else if (alpha == amask) {
|
||||
/* opaque alpha -- copy RGB, keep dst alpha */
|
||||
*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
|
||||
} else if (alpha == amask || (*dstp & amask) == 0) {
|
||||
*dstp = *srcp;
|
||||
} else {
|
||||
src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
|
||||
src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
|
||||
|
@ -366,15 +364,17 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
|
|||
mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
|
||||
mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
|
||||
mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
|
||||
mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
|
||||
mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
|
||||
mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
|
||||
mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
|
||||
mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha*/
|
||||
|
||||
/* blend */
|
||||
src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
|
||||
src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
|
||||
src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
|
||||
dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
|
||||
dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
|
||||
src1 = _mm_mullo_pi16(src1, mm_alpha);
|
||||
src1 = _mm_srli_pi16(src1, 8);
|
||||
dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
|
||||
dst1 = _mm_srli_pi16(dst1, 8);
|
||||
dst1 = _mm_add_pi16(src1, dst1);
|
||||
dst1 = _mm_packs_pu16(dst1, mm_zero);
|
||||
|
||||
*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
|
||||
}
|
||||
|
@ -481,23 +481,24 @@ BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
|
|||
compositioning used (>>8 instead of /255) doesn't handle
|
||||
it correctly. Also special-case alpha=0 for speed?
|
||||
Benchmark this! */
|
||||
if(alpha) {
|
||||
if(alpha == SDL_ALPHA_OPAQUE) {
|
||||
*dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
|
||||
if (alpha) {
|
||||
if (alpha == SDL_ALPHA_OPAQUE) {
|
||||
*dstp = *srcp;
|
||||
} else {
|
||||
/*
|
||||
* take out the middle component (green), and process
|
||||
* the other two in parallel. One multiply less.
|
||||
*/
|
||||
d = *dstp;
|
||||
dalpha = d & 0xff000000;
|
||||
dalpha = d >> 24;
|
||||
s1 = s & 0xff00ff;
|
||||
d1 = d & 0xff00ff;
|
||||
d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
|
||||
s &= 0xff00;
|
||||
d &= 0xff00;
|
||||
d = (d + ((s - d) * alpha >> 8)) & 0xff00;
|
||||
*dstp = d1 | d | dalpha;
|
||||
dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
|
||||
*dstp = d1 | d | (dalpha << 24);
|
||||
}
|
||||
}
|
||||
++srcp;
|
||||
|
@ -524,15 +525,14 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
|
|||
Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
|
||||
Uint32 amask = sf->Amask;
|
||||
Uint32 ashift = sf->Ashift;
|
||||
Uint64 multmask;
|
||||
Uint64 multmask, multmask2;
|
||||
|
||||
__m64 src1, dst1, mm_alpha, mm_zero, dmask;
|
||||
__m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
|
||||
|
||||
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
|
||||
multmask = 0xFFFF;
|
||||
multmask = 0x00FF;
|
||||
multmask <<= (ashift * 2);
|
||||
multmask = ~multmask;
|
||||
dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
|
||||
multmask2 = 0x00FF00FF00FF00FF;
|
||||
|
||||
while (height--) {
|
||||
/* *INDENT-OFF* */
|
||||
|
@ -545,9 +545,8 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
|
|||
alpha = *srcp & amask;
|
||||
if (alpha == 0) {
|
||||
/* do nothing */
|
||||
} else if (alpha == amask) {
|
||||
/* copy RGB, keep dst alpha */
|
||||
*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
|
||||
} else if (alpha == amask || (*dstp & amask) == 0) {
|
||||
*dstp = *srcp;
|
||||
} else {
|
||||
src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
|
||||
src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
|
||||
|
@ -558,15 +557,18 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
|
|||
mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
|
||||
mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
|
||||
mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
|
||||
mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
|
||||
mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
|
||||
mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
|
||||
mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
|
||||
mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha*/
|
||||
|
||||
|
||||
/* blend */
|
||||
src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
|
||||
src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
|
||||
src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
|
||||
dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
|
||||
dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
|
||||
src1 = _mm_mullo_pi16(src1, mm_alpha);
|
||||
src1 = _mm_srli_pi16(src1, 8);
|
||||
dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
|
||||
dst1 = _mm_srli_pi16(dst1, 8);
|
||||
dst1 = _mm_add_pi16(src1, dst1);
|
||||
dst1 = _mm_packs_pu16(dst1, mm_zero);
|
||||
|
||||
*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue