From 0eaa52cedf6b63ed65768f88bb335db7aca40685 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Thu, 24 Oct 2019 21:13:56 -0400
Subject: [PATCH] ARM: SIMD assembly optimization for function
 BlitARGBto565PixelAlpha

---
 src/video/SDL_blit_A.c              |  24 ++++
 src/video/arm/pixman-arm-simd-asm.S | 197 ++++++++++++++++++++++++++++
 2 files changed, 221 insertions(+)

diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 07dd98001..a1d087d4e 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -390,6 +390,21 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
 #endif /* __MMX__ */
 
 #if SDL_ARM_SIMD_BLITTERS
+void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
+
+static void
+BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
+{
+	int32_t width = info->dst_w;
+	int32_t height = info->dst_h;
+	uint16_t *dstp = (uint16_t *)info->dst;
+	int32_t dststride = width + (info->dst_skip >> 1);
+	uint32_t *srcp = (uint32_t *)info->src;
+	int32_t srcstride = width + (info->src_skip >> 2);
+
+	BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
+}
+
 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
 
 static void
@@ -1301,6 +1316,15 @@ SDL_CalculateBlitA(SDL_Surface * surface)
             }
 
         case 2:
+#if SDL_ARM_SIMD_BLITTERS
+                if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
+                    && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
+                    && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
+                    || (sf->Bmask == 0xff && df->Bmask == 0x1f))
+                    && SDL_HasARMSIMD())
+                        return BlitARGBto565PixelAlphaARMSIMD;
+                else
+#endif
                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
                     && sf->Gmask == 0xff00
                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S
index 2d65887e5..6dcbbe5e2 100644
--- a/src/video/arm/pixman-arm-simd-asm.S
+++ b/src/video/arm/pixman-arm-simd-asm.S
@@ -166,3 +166,200 @@ generate_composite_function \
     RGBtoRGBPixelAlpha_process_tail
 
 /******************************************************************************/
+
+.macro ARGBto565PixelAlpha_init
+        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
+        mov     MASK, #0x001f
+        mov     STRIDE_M, #0x0010
+        orr     MASK, MASK, MASK, lsl #16
+        orr     STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
+.endm
+
+.macro ARGBto565PixelAlpha_newline
+        mov     STRIDE_S, #0x0200
+.endm
+
+/* On entry:
+ * s1 holds 1 32bpp source pixel
+ * d holds 1 16bpp destination pixel
+ * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
+ * other registers are temporaries
+ * On exit:
+ * Constant registers preserved
+ */
+
+.macro ARGBto565PixelAlpha_1pixel_translucent  s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
+        mov     alpha, s, lsr #27
+        and     misc, s, #0xfc00
+        and     g, d, #0x07e0
+        pkhbt   rb, d, d, lsl #5
+        rsb     misc, g, misc, lsr #5
+        and     s, rbmask, s, lsr #3
+        and     rb, rbmask, rb
+        sub     s, s, rb
+        smlabb  misc, misc, alpha, ghalf
+        mla     s, s, alpha, rbhalf
+        add     misc, misc, misc, lsl #5
+        add     g, g, misc, asr #10
+        add     s, s, s, lsl #5
+        and     g, g, #0x07e0
+        add     rb, rb, s, asr #10
+        and     rb, rb, rbmask
+        pkhbt   rb, rb, rb, lsl #11
+        orr     d, rb, g
+        orr     d, d, rb, lsr #16
+.endm
+
+/* On entry:
+ * s1 holds 1 32bpp source pixel
+ * d holds 1 16bpp destination pixel
+ * rbmask holds 0x001f001f
+ * On exit:
+ * Constant registers preserved
+ */
+
+.macro ARGBto565PixelAlpha_1pixel_opaque  s, d, rbmask
+        and     d, rbmask, s, lsr #3
+        and     s, s, #0xfc00
+        orr     d, d, d, lsr #5
+        orr     d, d, s, lsr #5
+.endm
+
+/* On entry:
+ * s1, s2 hold 2 32bpp source pixels
+ * d holds 2 16bpp destination pixels
+ * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
+ * other registers are temporaries
+ * On exit:
+ * Constant registers preserved
+ * Blended results have been written through destination pointer
+ */
+
+.macro ARGBto565PixelAlpha_2pixels_translucent  s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
+        mov     alpha, s1, lsr #27
+        and     misc, s1, #0xfc00
+        and     g, d, #0x07e0
+        pkhbt   rb, d, d, lsl #5
+        rsb     misc, g, misc, lsr #5
+        and     s1, rbmask, s1, lsr #3
+        and     rb, rbmask, rb
+        sub     s1, s1, rb
+        smlabb  misc, misc, alpha, ghalf
+        mla     s1, s1, alpha, rbhalf
+          uxth    d, d, ror #16
+        add     misc, misc, misc, lsl #5
+          mov     alpha, s2, lsr #27
+        add     g, g, misc, asr #10
+        add     s1, s1, s1, lsl #5
+        and     g, g, #0x07e0
+        add     rb, rb, s1, asr #10
+        and     rb, rb, rbmask
+          and     misc, s2, #0xfc00
+        pkhbt   rb, rb, rb, lsl #11
+          and     s1, d, #0x07e0
+          pkhbt   d, d, d, lsl #5
+          rsb     misc, s1, misc, lsr #5
+          and     s2, rbmask, s2, lsr #3
+          and     d, rbmask, d
+          sub     s2, s2, d
+          smlabb  misc, misc, alpha, ghalf
+          mla     s2, s2, alpha, rbhalf
+        orr     alpha, rb, g
+          add     misc, misc, misc, lsl #5
+        orr     alpha, alpha, rb, lsr #16
+          add     s1, s1, misc, asr #10
+          add     s2, s2, s2, lsl #5
+          and     s1, s1, #0x07e0
+          add     d, d, s2, asr #10
+          and     d, d, rbmask
+        strh    alpha, [DST, #-4]
+          pkhbt   d, d, d, lsl #11
+          orr     alpha, d, s1
+          orr     alpha, alpha, d, lsr #16
+          strh    alpha, [DST, #-2]
+.endm
+
+/* On entry:
+ * s1, s2 hold 2 32bpp source pixels
+ * rbmask holds 0x001f001f
+ * other registers are temporaries
+ * On exit:
+ * Constant registers preserved
+ * Blended results have been written through destination pointer
+ */
+
+.macro ARGBto565PixelAlpha_2pixels_opaque  s1, s2, d, rbmask, g
+        and     g, s1, #0xfc00
+        and     d, rbmask, s1, lsr #3
+          and     s1, rbmask, s2, lsr #3
+        orr     d, d, d, lsr #5
+        orr     d, d, g, lsr #5
+          and     g, s2, #0xfc00
+        strh    d, [DST, #-4]
+          orr     s1, s1, s1, lsr #5
+          orr     s1, s1, g, lsr #5
+          strh    s1, [DST, #-2]
+.endm
+
+.macro ARGBto565PixelAlpha_2pixels_head
+        ldrd    WK0, WK1, [SRC], #8
+        ldr     WK2, [DST], #4
+        orr     SCRATCH, WK0, WK1
+        and     ORIG_W, WK0, WK1
+        tst     SCRATCH, #0xff000000
+.endm
+
+.macro ARGBto565PixelAlpha_2pixels_tail
+        beq     20f @ all transparent
+        cmp     ORIG_W, #0xff000000
+        bhs     10f @ all opaque
+        ARGBto565PixelAlpha_2pixels_translucent  WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
+        b       20f
+10:     ARGBto565PixelAlpha_2pixels_opaque  WK0, WK1, WK2, MASK, SCRATCH
+20:
+.endm
+
+.macro ARGBto565PixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        ARGBto565PixelAlpha_2pixels_head
+        ARGBto565PixelAlpha_2pixels_tail
+        ARGBto565PixelAlpha_2pixels_head
+        ARGBto565PixelAlpha_2pixels_tail
+ .endif
+ .if numbytes >= 8
+        ARGBto565PixelAlpha_2pixels_head
+        ARGBto565PixelAlpha_2pixels_tail
+ .endif
+ .if numbytes >= 4
+        ARGBto565PixelAlpha_2pixels_head
+ .else // numbytes == 2
+        ldr     WK0, [SRC], #4
+        ldrh    WK2, [DST], #2
+        tst     WK0, #0xff000000
+ .endif
+.endm
+
+.macro ARGBto565PixelAlpha_process_tail  cond, numbytes, firstreg
+ .if numbytes >= 4
+        ARGBto565PixelAlpha_2pixels_tail
+ .else // numbytes == 2
+        beq     20f @ all transparent
+        cmp     WK0, #0xff000000
+        bhs     10f @ opaque
+        ARGBto565PixelAlpha_1pixel_translucent  WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
+        b       19f
+10:     ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
+19:     strh    WK2, [DST, #-2]
+20:
+ .endif
+.endm
+
+generate_composite_function \
+    BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    ARGBto565PixelAlpha_init, \
+    ARGBto565PixelAlpha_newline, \
+    nop_macro, /* cleanup */ \
+    ARGBto565PixelAlpha_process_head, \
+    ARGBto565PixelAlpha_process_tail