Add RGBA5551, RGBA4444 support to ImageDraw

2020-07-07 22:24:31 +10:00 · 2020-07-07 22:24:31 +10:00 · eace25611c
commit eace25611c
parent 05f47d2841
1 changed files with 86 additions and 3 deletions
--- a/RaylibSIMD.h
+++ b/RaylibSIMD.h
@ -203,7 +203,12 @@ void RaylibSIMD_ImageDraw(Image *dst, Image src, Rectangle srcRec, Rectangle dst
        RaylibSIMD_ImageDrawMode draw_mode = RaylibSIMD_ImageDrawMode_Original;

        if (dst->format == UNCOMPRESSED_R8G8B8A8 &&
-            (srcPtr->format == UNCOMPRESSED_R8G8B8A8 || srcPtr->format == UNCOMPRESSED_R8G8B8 || srcPtr->format == UNCOMPRESSED_R5G6B5))
+            (srcPtr->format == UNCOMPRESSED_R8G8B8A8 ||
+             srcPtr->format == UNCOMPRESSED_R8G8B8 ||
+             srcPtr->format == UNCOMPRESSED_R5G6B5 ||
+             srcPtr->format == UNCOMPRESSED_R5G5B5A1 ||
+             srcPtr->format == UNCOMPRESSED_R4G4B4A4
+             ))
        {
            draw_mode = RaylibSIMD_ImageDrawMode_SIMD;
        }
@ -389,6 +394,84 @@ void RaylibSIMD_ImageDraw(Image *dst, Image src, Rectangle srcRec, Rectangle dst
                    src_g_to_01_space_coefficient = _mm_set1_ps(1.f/63.f);
                    src_b_to_01_space_coefficient = _mm_set1_ps(1.f/31.f);
                }
+                else if (srcPtr->format == UNCOMPRESSED_R5G5B5A1)
+                {
+                    // NOTE: For a 128bit SIMD register with 4 lanes of 32 bits,
+                    // each lane holds 2 16bit pixels (8 pixels per register).
+                    //
+                    // RGBA5551 Pixel
+                    // Bits       | 01234 | 56789 | 10 11 12 13 14 | 15
+                    // Color Bits | RRRRR | GGGGG |  B  B  B  B  B | A
+                    //
+                    // Register   | {[P1, P2] [P3, P4] [P5, P6] [P7, P8]}
+                    //
+                    // See UNCOMPRESSED_R8G8B8 for reason for shuffle. Our
+                    // desired pattern is 1 pixel per lane for color blending in
+                    // [0, 1] normalized 32bit float space.
+                    //
+                    // Register   | {[P1]   [P2]    [P3]    [P4]}
+                    // Bits       |  [0:15] [16:31] [32:47] [48:63]
+                    // Bytes      |  [0:1]  [2:3]   [4:5]   [6:7]
+
+                    r_bit_shift = 11;
+                    g_bit_shift = 6;
+                    b_bit_shift = 1;
+                    a_bit_shift = 0;
+
+                    r_mask_4x = _mm_set1_epi32(0b11111);
+                    g_mask_4x = _mm_set1_epi32(0b11111);
+                    b_mask_4x = _mm_set1_epi32(0b11111);
+                    a_mask_4x = _mm_set1_epi32(0b00001);
+
+                    src_pixels_shuffle = _mm_setr_epi8(0, 1, 0, 1, // Lane 1
+                                                       2, 3, 2, 3,
+                                                       4, 5, 4, 5,
+                                                       6, 7, 6, 7);
+
+                    src_r_to_01_space_coefficient = _mm_set1_ps(1.f/31.f);
+                    src_g_to_01_space_coefficient = _mm_set1_ps(1.f/31.f);
+                    src_b_to_01_space_coefficient = _mm_set1_ps(1.f/31.f);
+                    src_a_to_01_space_coefficient = _mm_set1_ps(1.f);
+                }
+                else if (srcPtr->format == UNCOMPRESSED_R4G4B4A4)
+                {
+                    // NOTE: For a 128bit SIMD register with 4 lanes of 32 bits,
+                    // each lane holds 2 16bit pixels (8 pixels per register).
+                    //
+                    // RGBA4444 Pixel
+                    // Bits       | 0123 | 4567 | 89 10 11 | 12 13 14 15
+                    // Color Bits | RRRR | GGGG | BB  B  B |  A  A  A  A
+                    //
+                    // Register   | {[P1, P2] [P3, P4] [P5, P6] [P7, P8]}
+                    //
+                    // See UNCOMPRESSED_R8G8B8 for reason for shuffle. Our
+                    // desired pattern is 1 pixel per lane for color blending in
+                    // [0, 1] normalized 32bit float space.
+                    //
+                    // Register   | {[P1]   [P2]    [P3]    [P4]}
+                    // Bits       |  [0:15] [16:31] [32:47] [48:63]
+                    // Bytes      |  [0:1]  [2:3]   [4:5]   [6:7]
+
+                    r_bit_shift = 12;
+                    g_bit_shift = 8;
+                    b_bit_shift = 4;
+                    a_bit_shift = 0;
+
+                    r_mask_4x = _mm_set1_epi32(0b1111);
+                    g_mask_4x = _mm_set1_epi32(0b1111);
+                    b_mask_4x = _mm_set1_epi32(0b1111);
+                    a_mask_4x = _mm_set1_epi32(0b1111);
+
+                    src_pixels_shuffle = _mm_setr_epi8(0, 1, 0, 1, // Lane 1
+                                                       2, 3, 2, 3,
+                                                       4, 5, 4, 5,
+                                                       6, 7, 6, 7);
+
+                    src_r_to_01_space_coefficient = _mm_set1_ps(1.f/15.f);
+                    src_g_to_01_space_coefficient = _mm_set1_ps(1.f/15.f);
+                    src_b_to_01_space_coefficient = _mm_set1_ps(1.f/15.f);
+                    src_a_to_01_space_coefficient = _mm_set1_ps(1.f/15.f);
+                }

                __m128 const src_alpha_min_4x = _mm_set1_ps(src_alpha_min);

@ -447,8 +530,8 @@ void RaylibSIMD_ImageDraw(Image *dst, Image src, Rectangle srcRec, Rectangle dst
                        __m128 src0123_b  = _mm_cvtepi32_ps(src0123_b_int);
                        __m128 src0123_a  = _mm_cvtepi32_ps(src0123_a_int);

-                        // NOTE: For 3BPP Images the src_alpha_min_4x is set to 255 to completely overwrite dest.
-                        //       For 4BPP Images the src_alpha_min_4x is set to 0 (i.e. no-op)
+                        // NOTE: For images without an alpha component the src_alpha_min_4x is set to 255 to completely overwrite dest.
+                        //       For images with an alpha component the src_alpha_min_4x is set to 0 (i.e. no-op)
                        src0123_a = _mm_max_ps(src0123_a, src_alpha_min_4x);

                        __m128 dest0123_r = _mm_cvtepi32_ps(dest0123_r_int);