Add RGBA5551, RGBA4444 support to ImageDraw

This commit is contained in:
doyle 2020-07-07 22:24:31 +10:00
parent 05f47d2841
commit eace25611c

View File

@ -203,7 +203,12 @@ void RaylibSIMD_ImageDraw(Image *dst, Image src, Rectangle srcRec, Rectangle dst
RaylibSIMD_ImageDrawMode draw_mode = RaylibSIMD_ImageDrawMode_Original; RaylibSIMD_ImageDrawMode draw_mode = RaylibSIMD_ImageDrawMode_Original;
if (dst->format == UNCOMPRESSED_R8G8B8A8 && if (dst->format == UNCOMPRESSED_R8G8B8A8 &&
(srcPtr->format == UNCOMPRESSED_R8G8B8A8 || srcPtr->format == UNCOMPRESSED_R8G8B8 || srcPtr->format == UNCOMPRESSED_R5G6B5)) (srcPtr->format == UNCOMPRESSED_R8G8B8A8 ||
srcPtr->format == UNCOMPRESSED_R8G8B8 ||
srcPtr->format == UNCOMPRESSED_R5G6B5 ||
srcPtr->format == UNCOMPRESSED_R5G5B5A1 ||
srcPtr->format == UNCOMPRESSED_R4G4B4A4
))
{ {
draw_mode = RaylibSIMD_ImageDrawMode_SIMD; draw_mode = RaylibSIMD_ImageDrawMode_SIMD;
} }
@ -389,6 +394,84 @@ void RaylibSIMD_ImageDraw(Image *dst, Image src, Rectangle srcRec, Rectangle dst
src_g_to_01_space_coefficient = _mm_set1_ps(1.f/63.f); src_g_to_01_space_coefficient = _mm_set1_ps(1.f/63.f);
src_b_to_01_space_coefficient = _mm_set1_ps(1.f/31.f); src_b_to_01_space_coefficient = _mm_set1_ps(1.f/31.f);
} }
else if (srcPtr->format == UNCOMPRESSED_R5G5B5A1)
{
// NOTE: For a 128bit SIMD register with 4 lanes of 32 bits,
// we can store 2 pixels per register.
//
// RGBA5551 Pixel
// Bits | 01234 | 56789 | 10 11 12 13 14 | 15
// Color Bits | RRRRR | GGGGG | B B B B B | A
//
// Register | {[P1, P2] [P3, P4] [P5, P6] [P7, P8]}
//
// See UNCOMPRESSED_R8G8B8 for reason for shuffle. Our
// desired pattern is 1 pixel per lane for color blending in
// [0, 1] normalized 32bit float space.
//
// Register | {[P1] [P2] [P3] [P4]}
// Bits | [0:15] [16:31] [32:46] [46:61]
// Bytes | [0:1] [2:3] [4:5] [6:7]
r_bit_shift = 11;
g_bit_shift = 6;
b_bit_shift = 1;
a_bit_shift = 0;
r_mask_4x = _mm_set1_epi32(0b11111);
g_mask_4x = _mm_set1_epi32(0b11111);
b_mask_4x = _mm_set1_epi32(0b11111);
a_mask_4x = _mm_set1_epi32(0b00001);
src_pixels_shuffle = _mm_setr_epi8(0, 1, 0, 1, // Lane 1
2, 3, 2, 3,
4, 5, 4, 5,
6, 7, 6, 7);
src_r_to_01_space_coefficient = _mm_set1_ps(1.f/31.f);
src_g_to_01_space_coefficient = _mm_set1_ps(1.f/31.f);
src_b_to_01_space_coefficient = _mm_set1_ps(1.f/31.f);
src_a_to_01_space_coefficient = _mm_set1_ps(1.f);
}
else if (srcPtr->format == UNCOMPRESSED_R4G4B4A4)
{
// NOTE: For a 128bit SIMD register with 4 lanes of 32 bits,
// we can store 2 16bit pixels per register.
//
// RGBA5551 Pixel
// Bits | 0123 | 4567 | 89 10 11 | 12 13 14 15
// Color Bits | RRRR | GGGG | BB B B | A A A A
//
// Register | {[P1, P2] [P3, P4] [P5, P6] [P7, P8]}
//
// See UNCOMPRESSED_R8G8B8 for reason for shuffle. Our
// desired pattern is 1 pixel per lane for color blending in
// [0, 1] normalized 32bit float space.
//
// Register | {[P1] [P2] [P3] [P4]}
// Bits | [0:15] [16:31] [32:46] [46:61]
// Bytes | [0:1] [2:3] [4:5] [6:7]
r_bit_shift = 12;
g_bit_shift = 8;
b_bit_shift = 4;
a_bit_shift = 0;
r_mask_4x = _mm_set1_epi32(0b1111);
g_mask_4x = _mm_set1_epi32(0b1111);
b_mask_4x = _mm_set1_epi32(0b1111);
a_mask_4x = _mm_set1_epi32(0b1111);
src_pixels_shuffle = _mm_setr_epi8(0, 1, 0, 1, // Lane 1
2, 3, 2, 3,
4, 5, 4, 5,
6, 7, 6, 7);
src_r_to_01_space_coefficient = _mm_set1_ps(1.f/15.f);
src_g_to_01_space_coefficient = _mm_set1_ps(1.f/15.f);
src_b_to_01_space_coefficient = _mm_set1_ps(1.f/15.f);
src_a_to_01_space_coefficient = _mm_set1_ps(1.f/15.f);
}
__m128 const src_alpha_min_4x = _mm_set1_ps(src_alpha_min); __m128 const src_alpha_min_4x = _mm_set1_ps(src_alpha_min);
@ -447,8 +530,8 @@ void RaylibSIMD_ImageDraw(Image *dst, Image src, Rectangle srcRec, Rectangle dst
__m128 src0123_b = _mm_cvtepi32_ps(src0123_b_int); __m128 src0123_b = _mm_cvtepi32_ps(src0123_b_int);
__m128 src0123_a = _mm_cvtepi32_ps(src0123_a_int); __m128 src0123_a = _mm_cvtepi32_ps(src0123_a_int);
// NOTE: For 3BPP Images the src_alpha_min_4x is set to 255 to completely overwrite dest. // NOTE: For images without an alpha component the src_alpha_min_4x is set to 255 to completely overwrite dest.
// For 4BPP Images the src_alpha_min_4x is set to 0 (i.e. no-op) // For images with an alpha component the src_alpha_min_4x is set to 0 (i.e. no-op)
src0123_a = _mm_max_ps(src0123_a, src_alpha_min_4x); src0123_a = _mm_max_ps(src0123_a, src_alpha_min_4x);
__m128 dest0123_r = _mm_cvtepi32_ps(dest0123_r_int); __m128 dest0123_r = _mm_cvtepi32_ps(dest0123_r_int);