Add UCS <-> UTF-8 conversion in both directions

2017-04-09 22:35:54 +10:00 · 2017-04-09 22:35:54 +10:00 · 3e0064d741
commit 3e0064d741
parent 30e219e7c3
2 changed files with 239 additions and 59 deletions
--- a/dqnt.h
+++ b/dqnt.h
@ -162,6 +162,7 @@ DQNT_FILE_SCOPE DqntV2   dqnt_rect_get_size_v2(DqntRect rect);
 DQNT_FILE_SCOPE DqntV2   dqnt_rect_get_centre (DqntRect rect);
 DQNT_FILE_SCOPE DqntRect dqnt_rect_move       (DqntRect rect, DqntV2 shift);
 DQNT_FILE_SCOPE bool     dqnt_rect_contains_p (DqntRect rect, DqntV2 p);
 ////////////////////////////////////////////////////////////////////////////////
 // String Ops
 ////////////////////////////////////////////////////////////////////////////////
@ -179,9 +180,24 @@ DQNT_FILE_SCOPE bool  dqnt_str_reverse(char *buf, const i32 bufSize);
 DQNT_FILE_SCOPE i32   dqnt_str_to_i32 (char *const buf, const i32 bufSize);
 DQNT_FILE_SCOPE void  dqnt_i32_to_str (i32 value, char *buf, i32 bufSize);
-DQNT_FILE_SCOPE i32  dqnt_wstrcmp(const wchar_t *a, const wchar_t *b);
+// Both return the length in bytes of the UTF-8 sequence (written or read), or 0 if the codepoint or UTF-8 input is invalid
-DQNT_FILE_SCOPE void dqnt_wstrcat(const wchar_t *a, i32 lenA, const wchar_t *b, i32 lenB, wchar_t *out, i32 outLen);
+DQNT_FILE_SCOPE u32 dqnt_ucs_to_utf8(u32 *dest, u32 character);
-DQNT_FILE_SCOPE i32  dqnt_wstrlen(const wchar_t *a);
+DQNT_FILE_SCOPE u32 dqnt_utf8_to_ucs(u32 *dest, u32 character);
 ////////////////////////////////////////////////////////////////////////////////
 // File Operations
 ////////////////////////////////////////////////////////////////////////////////
 // Pairs an opaque file handle with a size value.
 // NOTE(review): handle is presumably the platform's native file handle and
 // size the file length in bytes -- neither is established here; confirm
 // against the file operation implementations.
 typedef struct DqntFile
 {
 	void *handle;
 	u64   size;
 } DqntFile;
 #if 0
 bool platform_open_file (char *const file, PlatformFile *platformFile);
 // Return the number of bytes read
 u32  platform_read_file (PlatformFile file, void *buffer, u32 numBytesToRead);
 void platform_close_file(PlatformFile *file);
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Timer
@ -728,7 +744,7 @@ DQNT_FILE_SCOPE inline bool dqnt_rect_contains_p(DqntRect rect, DqntV2 p)
 }
 ////////////////////////////////////////////////////////////////////////////////
-// String Ops
+// String Operations
 ////////////////////////////////////////////////////////////////////////////////
 DQNT_FILE_SCOPE bool dqnt_char_is_digit(char c)
 {
@ -870,27 +886,166 @@ DQNT_FILE_SCOPE void dqnt_i32_to_str(i32 value, char *buf, i32 bufSize)
 	}
 }
-DQNT_FILE_SCOPE i32 dqnt_wstrcmp(const wchar_t *a, const wchar_t *b)
+/*
-{
+	Encoding
-	if (!a && !b) return -1;
+	The following byte sequences are used to represent a character. The sequence
-	if (!a) return -1;
+	to be used depends on the UCS code number of the character:
 	if (!b) return -1;
-	while ((*a) == (*b))
+	The extra 1's are the headers used to identify the string as a UTF-8 string.
 	UCS [0x00000000, 0x0000007F] -> UTF-8 0xxxxxxx
 	UCS [0x00000080, 0x000007FF] -> UTF-8 110xxxxx 10xxxxxx
 	UCS [0x00000800, 0x0000FFFF] -> UTF-8 1110xxxx 10xxxxxx 10xxxxxx
 	UCS [0x00010000, 0x001FFFFF] -> UTF-8 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 	UCS [0x00200000, 0x03FFFFFF] -> N/A   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 	UCS [0x04000000, 0x7FFFFFFF] -> N/A   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 	The xxx bit positions are filled with the bits of the character code number
 	in binary representation. Only the shortest possible multibyte sequence
 	which can represent the code number of the character can be used.
 	The UCS code values 0xd800–0xdfff (UTF-16 surrogates) as well as 0xfffe and
 	0xffff (UCS noncharacters) should not appear in conforming UTF-8 streams.
 */
 // Encode the UCS codepoint 'character' as UTF-8 and store the bytes in *dest.
 //
 // The sequence is written through a byte pointer in reverse order: the last
 // continuation byte goes to the lowest address and the lead byte to the highest
 // address used. On a little-endian host a zero-initialised *dest therefore
 // reads back as e.g. 0xC9B8 for U+0278. Bytes of *dest beyond the sequence
 // length are left untouched.
 // NOTE(review): the packed u32 value depends on host byte order.
 //
 // Returns the number of UTF-8 bytes written (1 to 4), or 0 when dest is NULL
 // or the codepoint cannot be encoded (UTF-16 surrogate, or above 0x10FFFF).
 DQNT_FILE_SCOPE u32 dqnt_ucs_to_utf8(u32 *dest, u32 character)
 {
 	if (!dest) return 0;
 	// Surrogates 0xD800-0xDFFF are reserved for UTF-16 and must never be
 	// encoded as UTF-8.
 	if (character >= 0xD800 && character <= 0xDFFF) return 0;
 	u8 *bytePtr = (u8 *)dest;
 	// Character is within ASCII range, so it is stored as is
 	// UTF Bit Arrangement: 0xxxxxxx
 	// Character          : 0xxxxxxx
 	if (character < 0x80)
 	{
 		bytePtr[0] = (u8)character;
 		return 1;
 	}
 	// UTF Header Bits    : 11000000 10000000
 	// UTF Bit Arrangement: 000xxxxx 00xxxxxx
 	// Character          : 00000xxx xxxxxxxx
 	if (character < 0x800)
 	{
 		// Lead byte: top 5 bits, OR the 0xC0 (11000000) header bits
 		bytePtr[1] = (u8)((character >> 6) | 0xC0);
 		// Continuation byte: low 6 bits, OR the 0x80 (10000000) header bits
 		bytePtr[0] = (u8)((character & 0x3F) | 0x80);
 		return 2;
 	}
 	// UTF Header Bits     : 11100000 10000000 10000000
 	// UTF Bit Arrangement : 0000xxxx 00xxxxxx 00xxxxxx
 	// Character           : 00000000 xxxxxxxx xxxxxxxx
 	if (character < 0x10000)
 	{
 		// Lead byte: top 4 bits, OR the 0xE0 (11100000) header bits
 		bytePtr[2] = (u8)((character >> 12) | 0xE0);
 		// Middle continuation byte: the mask is required, otherwise bits 12
 		// and 13 of the codepoint leak into the 10xxxxxx header
 		bytePtr[1] = (u8)(((character >> 6) & 0x3F) | 0x80);
 		// Last continuation byte: low 6 bits, OR the 0x80 header bits
 		bytePtr[0] = (u8)((character & 0x3F) | 0x80);
 		return 3;
 	}
 	// UTF Header Bits     : 11110000 10000000 10000000 10000000
 	// UTF Bit Arrangement : 00000xxx 00xxxxxx 00xxxxxx 00xxxxxx
 	// Character           : 00000000 00000xxx xxxxxxxx xxxxxxxx
 	if (character < 0x110000)
 	{
 		// Lead byte: top 3 bits, OR the 0xF0 (11110000) header bits
 		bytePtr[3] = (u8)((character >> 18) | 0xF0);
 		// 1st continuation byte: 6 bits, OR the 0x80 (10000000) header bits
 		bytePtr[2] = (u8)(((character >> 12) & 0x3F) | 0x80);
 		// 2nd continuation byte: 6 bits, OR the 0x80 (10000000) header bits
 		bytePtr[1] = (u8)(((character >> 6) & 0x3F) | 0x80);
 		// 3rd continuation byte: low 6 bits, OR the 0x80 header bits
 		bytePtr[0] = (u8)((character & 0x3F) | 0x80);
 		return 4;
 	}
 	// Beyond the last Unicode codepoint (0x10FFFF): not encodable
 	return 0;
 }
-DQNT_FILE_SCOPE i32 dqnt_wstrlen(const wchar_t *a)
+DQNT_FILE_SCOPE u32 dqnt_utf8_to_ucs(u32 *dest, u32 character)
 {
-	i32 result = 0;
+	if (!dest) return 0;
-	while (a && a[result]) result++;
+
-	return result;
+	// UTF Header Bits     : 11110000 10000000 10000000 10000000
 	// UTF Bit Arrangement : 00000xxx 00xxxxxx 00xxxxxx 00xxxxxx
 	// UCS                 : 00000000 00000xxx xxxxxxxx xxxxxxxx
 	const u32 headerBits4Bytes = 0xF0808080;
 	if ((character & headerBits4Bytes) == headerBits4Bytes)
 	{
 		u32 utfWithoutHeader = headerBits4Bytes ^ character;
 		u32 firstByte  = utfWithoutHeader & 0x3F;
 		u32 secondByte = (utfWithoutHeader >> 8)  & 0x3F;
 		u32 thirdByte  = (utfWithoutHeader >> 16) & 0x3F;
 		u32 fourthByte = utfWithoutHeader >> 24;
 		u32 result =
 		    (fourthByte << 18 | thirdByte << 12 | secondByte << 6 | firstByte);
 		*dest = result;
 		return 4;
 	}
 	// UTF Header Bits     : 11100000 10000000 10000000
 	// UTF Bit Arrangement : 0000xxxx 00xxxxxx 00xxxxxx
 	// UCS                 : 00000000 xxxxxxxx xxxxxxxx
 	const u32 headerBits3Bytes = 0xE08080;
 	if ((character & headerBits3Bytes)  == headerBits3Bytes)
 	{
 		u32 utfWithoutHeader = headerBits3Bytes ^ character;
 		u32 firstByte  = utfWithoutHeader & 0x3F;
 		u32 secondByte = (utfWithoutHeader >> 8) & 0x3F;
 		u32 thirdByte  = utfWithoutHeader >> 16;
 		u32 result = (thirdByte << 12 | secondByte << 6 | firstByte);
 		*dest = result;
 		return 3;
 	}
 	// UTF Header Bits    : 11000000 00xxxxxx
 	// UTF Bit Arrangement: 000xxxxx 00xxxxxx
 	// UCS                : 00000xxx xxxxxxxx
 	const u32 headerBits2Bytes = 0xC000;
 	if ((character & headerBits2Bytes) == headerBits2Bytes)
 	{
 		u32 utfWithoutHeader = headerBits2Bytes ^ character;
 		u32 firstByte  = utfWithoutHeader & 0x3F;
 		u32 secondByte = utfWithoutHeader >> 8;
 		u32 result = (secondByte << 6 | firstByte);
 		*dest = result;
 		return 2;
 	}
 	// Character is within ASCII range, so it's an ascii character
 	// UTF Bit Arrangement: 0xxxxxxx
 	// UCS                : 0xxxxxxx
 	if (character >= 0x0 && character < 0x80)
 	{
 		u32 firstByte = (character & 0x3F);
 		*dest         = firstByte;
 		return 1;
 	}
 	return 0;
 }
 ////////////////////////////////////////////////////////////////////////////////
--- a/dqnt_unit_test.cpp
+++ b/dqnt_unit_test.cpp
@ -206,54 +206,79 @@ void dqnt_strings_test()
 			printf("dqnt_strings_test(): str_to_i32: Completed successfully\n");
 		}
-    }
+	}
-	// Wide String Checks
+    // UCS <-> UTF8 Checks
-	{
+    {
-		// wstrcmp
+	    // Test ascii characters
 		{
 			wchar_t *a = L"str_a";
 			// Check simple compares
 			{
 				DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == +0);
 				DQNT_ASSERT(dqnt_wstrcmp(a, L"str_b") == -1);
 				DQNT_ASSERT(dqnt_wstrcmp(L"str_b", a) == +1);
 				DQNT_ASSERT(dqnt_wstrcmp(a, L"") == +1);
 				DQNT_ASSERT(dqnt_wstrcmp(L"", L"") == 0);
 				// NOTE: Check that the string has not been trashed.
 				DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == +0);
 			}
 			// Check ops against null
 			{
 				DQNT_ASSERT(dqnt_wstrcmp(NULL, NULL) != +0);
 				DQNT_ASSERT(dqnt_wstrcmp(a, NULL) != +0);
 				DQNT_ASSERT(dqnt_wstrcmp(NULL, a) != +0);
 			}
 			printf("dqnt_strings_test(): wstrcmp: Completed successfully\n");
 		}
 	    // wstrlen
 	    {
-		    wchar_t *a = L"str_a";
+		    u32 codepoint = '@';
-		    DQNT_ASSERT(dqnt_wstrlen(a) == 5);
+		    u32 string[1] = {};
 		    DQNT_ASSERT(dqnt_wstrlen(L"") == 0);
 		    DQNT_ASSERT(dqnt_wstrlen(L"   a  ") == 6);
 		    DQNT_ASSERT(dqnt_wstrlen(L"a\n") == 2);
-		    // NOTE: Check that the string has not been trashed.
+		    u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
-		    DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == 0);
+		    DQNT_ASSERT(bytesUsed == 1);
 		    DQNT_ASSERT(string[0] == '@');
-		    DQNT_ASSERT(dqnt_wstrlen(NULL) == 0);
+		    bytesUsed = dqnt_utf8_to_ucs(&string[0], codepoint);
-
+		    DQNT_ASSERT(string[0] >= 0 && string[0] < 0x80);
-		    printf("dqnt_strings_test(): wstrlen: Completed successfully\n");
+		    DQNT_ASSERT(bytesUsed == 1);
 	    }
 	    // Test 2 byte characters
 		{
 		    u32 codepoint = 0x278;
 		    u32 string[1] = {};
 		    u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
 		    DQNT_ASSERT(bytesUsed == 2);
 		    DQNT_ASSERT(string[0] == 0xC9B8);
 		    bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]);
 		    DQNT_ASSERT(string[0] == codepoint);
 		    DQNT_ASSERT(bytesUsed == 2);
 	    }
 	    // Test 3 byte characters
 		{
 		    u32 codepoint = 0x0A0A;
 		    u32 string[1] = {};
 			u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
 		    DQNT_ASSERT(bytesUsed == 3);
 			DQNT_ASSERT(string[0] == 0xE0A88A);
 		    bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]);
 		    DQNT_ASSERT(string[0] == codepoint);
 		    DQNT_ASSERT(bytesUsed == 3);
 	    }
 	    // Test 4 byte characters
 		{
 		    u32 codepoint = 0x10912;
 		    u32 string[1] = {};
 			u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
 		    DQNT_ASSERT(bytesUsed == 4);
 		    DQNT_ASSERT(string[0] == 0xF090A492);
 		    bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]);
 		    DQNT_ASSERT(string[0] == codepoint);
 		    DQNT_ASSERT(bytesUsed == 4);
 	    }
 		{
 		    u32 codepoint = 0x10912;
 			u32 bytesUsed = dqnt_ucs_to_utf8(NULL, codepoint);
 		    DQNT_ASSERT(bytesUsed == 0);
 		    bytesUsed = dqnt_utf8_to_ucs(NULL, codepoint);
 		    DQNT_ASSERT(bytesUsed == 0);
 	    }
 	    printf("dqnt_strings_test(): ucs <-> utf8: Completed successfully\n");
    }
-	printf("dqnt_strings_test(): Completed successfully\n");
+    printf("dqnt_strings_test(): Completed successfully\n");
 }
 #include "Windows.h"