Add ucs to utf8 conversion to and from

2017-04-09 22:35:54 +10:00 · 2017-04-09 22:35:54 +10:00 · 3e0064d741
commit 3e0064d741
parent 30e219e7c3
2 changed files with 239 additions and 59 deletions
--- a/dqnt.h
+++ b/dqnt.h
@ -162,6 +162,7 @@ DQNT_FILE_SCOPE DqntV2   dqnt_rect_get_size_v2(DqntRect rect);
 DQNT_FILE_SCOPE DqntV2   dqnt_rect_get_centre (DqntRect rect);
 DQNT_FILE_SCOPE DqntRect dqnt_rect_move       (DqntRect rect, DqntV2 shift);
 DQNT_FILE_SCOPE bool     dqnt_rect_contains_p (DqntRect rect, DqntV2 p);
+
 ////////////////////////////////////////////////////////////////////////////////
 // String Ops
 ////////////////////////////////////////////////////////////////////////////////
@ -179,9 +180,24 @@ DQNT_FILE_SCOPE bool  dqnt_str_reverse(char *buf, const i32 bufSize);
 DQNT_FILE_SCOPE i32   dqnt_str_to_i32 (char *const buf, const i32 bufSize);
 DQNT_FILE_SCOPE void  dqnt_i32_to_str (i32 value, char *buf, i32 bufSize);

-DQNT_FILE_SCOPE i32  dqnt_wstrcmp(const wchar_t *a, const wchar_t *b);
-DQNT_FILE_SCOPE void dqnt_wstrcat(const wchar_t *a, i32 lenA, const wchar_t *b, i32 lenB, wchar_t *out, i32 outLen);
-DQNT_FILE_SCOPE i32  dqnt_wstrlen(const wchar_t *a);
+// Both return the number of UTF-8 bytes in the sequence (1 to 4), or 0 if dest
+// is NULL or the codepoint / UTF-8 sequence is rejected as invalid
+DQNT_FILE_SCOPE u32 dqnt_ucs_to_utf8(u32 *dest, u32 character);
+DQNT_FILE_SCOPE u32 dqnt_utf8_to_ucs(u32 *dest, u32 character);
+////////////////////////////////////////////////////////////////////////////////
+// File Operations
+////////////////////////////////////////////////////////////////////////////////
+// Handle/size pair declared for the file operations section.
+// NOTE(review): nothing visible here reads or writes these fields yet;
+// presumably 'handle' is an opaque platform file handle and 'size' is the file
+// size in bytes - confirm once the file functions are implemented.
+typedef struct DqntFile
+{
+	void *handle;
+	u64   size;
+} DqntFile;
+
+#if 0
+bool platform_open_file (char *const file, PlatformFile *platformFile);
+// Return the number of bytes read
+u32  platform_read_file (PlatformFile file, void *buffer, u32 numBytesToRead);
+void platform_close_file(PlatformFile *file);
+#endif

 ////////////////////////////////////////////////////////////////////////////////
 // Timer
@ -728,7 +744,7 @@ DQNT_FILE_SCOPE inline bool dqnt_rect_contains_p(DqntRect rect, DqntV2 p)
 }

 ////////////////////////////////////////////////////////////////////////////////
-// String Ops
+// String Operations
 ////////////////////////////////////////////////////////////////////////////////
 DQNT_FILE_SCOPE bool dqnt_char_is_digit(char c)
 {
@ -870,27 +886,166 @@ DQNT_FILE_SCOPE void dqnt_i32_to_str(i32 value, char *buf, i32 bufSize)
 	}
 }

-DQNT_FILE_SCOPE i32 dqnt_wstrcmp(const wchar_t *a, const wchar_t *b)
-{
-	if (!a && !b) return -1;
-	if (!a) return -1;
-	if (!b) return -1;
+/*
+	Encoding
+	The following byte sequences are used to represent a character. The sequence
+	to be used depends on the UCS code number of the character:

-	while ((*a) == (*b))
+	The leading 1 bits of each byte (terminated by a 0 bit) are header bits: they
+	mark a byte as the lead byte of an N byte sequence or as a continuation byte.
+	UCS [0x00000000, 0x0000007F] -> UTF-8 0xxxxxxx
+	UCS [0x00000080, 0x000007FF] -> UTF-8 110xxxxx 10xxxxxx
+	UCS [0x00000800, 0x0000FFFF] -> UTF-8 1110xxxx 10xxxxxx 10xxxxxx
+	UCS [0x00010000, 0x001FFFFF] -> UTF-8 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	UCS [0x00200000, 0x03FFFFFF] -> N/A   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+	UCS [0x04000000, 0x7FFFFFFF] -> N/A   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+	The xxx bit positions are filled with the bits of the character code number
+	in binary representation. Only the shortest possible multibyte sequence
+	which can represent the code number of the character can be used.
+
+	The UCS code values 0xd800–0xdfff (UTF-16 surrogates) as well as 0xfffe and
+	0xffff (UCS noncharacters) should not appear in conforming UTF-8 streams.
+*/
+// Encodes the UCS codepoint 'character' as UTF-8 into the u32 at 'dest'.
+//
+// The bytes are written through a u8 pointer with the LAST byte of the sequence
+// at byte index 0 and the lead byte at the highest index used. On a
+// little-endian host *dest therefore reads as the sequence in hex, e.g. 0xC9B8
+// for codepoint 0x278. Bytes of *dest beyond the returned count are left
+// untouched, so zero *dest beforehand if it is going to be read as a u32.
+//
+// Returns the number of UTF-8 bytes written (1 to 4), or 0 if dest is NULL or
+// character is above 0x10FFFF.
+//
+// NOTE: UTF-16 surrogates (0xD800-0xDFFF) are not rejected and get encoded as 3
+// bytes.
+DQNT_FILE_SCOPE u32 dqnt_ucs_to_utf8(u32 *dest, u32 character)
 {
-		if (!(*a)) return 0;
-		a++;
-		b++;
+	if (!dest) return 0;
+
+	u8 *bytePtr = (u8 *)dest;
+
+	// Character is within ASCII range, so it's an ascii character
+	// UTF Bit Arrangement: 0xxxxxxx
+	// Character          : 0xxxxxxx
+	if (character < 0x80)
+	{
+		bytePtr[0] = (u8)character;
+		return 1;
 	}

-	return (((*a) < (*b)) ? -1 : 1);
+	// UTF Header Bits    : 11000000 10000000
+	// UTF Bit Arrangement: 000xxxxx 00xxxxxx
+	// Character          : 00000xxx xxxxxxxx
+	if (character < 0x800)
+	{
+		// Lead byte: top 5 bits, OR the 0xC0 (11000000) header bits
+		bytePtr[1] = (u8)((character >> 6) | 0xC0);
+
+		// Last byte: low 6 bits, OR the 0x80 (10000000) header bits
+		bytePtr[0] = (u8)((character & 0x3F) | 0x80);
+
+		return 2;
 	}

-DQNT_FILE_SCOPE i32 dqnt_wstrlen(const wchar_t *a)
+	// UTF Header Bits     : 11100000 10000000 10000000
+	// UTF Bit Arrangement : 0000xxxx 00xxxxxx 00xxxxxx
+	// Character           : 00000000 xxxxxxxx xxxxxxxx
+	if (character < 0x10000)
 	{
-	i32 result = 0;
-	while (a && a[result]) result++;
-	return result;
+		// Lead byte: top 4 bits, OR the 0xE0 (11100000) header bits
+		bytePtr[2] = (u8)((character >> 12) | 0xE0);
+
+		// Middle byte: next 6 bits, OR the 0x80 (10000000) header bits.
+		// The 0x3F mask is required: without it bits 12-13 of the codepoint
+		// survive the u8 truncation and corrupt the 10xxxxxx header.
+		bytePtr[1] = (u8)(((character >> 6) & 0x3F) | 0x80);
+
+		// Last byte: low 6 bits, OR the 0x80 (10000000) header bits
+		bytePtr[0] = (u8)((character & 0x3F) | 0x80);
+
+		return 3;
+	}
+
+	// UTF Header Bits     : 11110000 10000000 10000000 10000000
+	// UTF Bit Arrangement : 00000xxx 00xxxxxx 00xxxxxx 00xxxxxx
+	// Character           : 00000000 00000xxx xxxxxxxx xxxxxxxx
+	if (character < 0x110000)
+	{
+		// Lead byte: top 3 bits, OR the 0xF0 (11110000) header bits
+		bytePtr[3] = (u8)((character >> 18) | 0xF0);
+
+		// 2nd byte of the sequence: 6 bits, OR the 0x80 (10000000) header bits
+		bytePtr[2] = (u8)(((character >> 12) & 0x3F) | 0x80);
+
+		// 3rd byte of the sequence: 6 bits, OR the 0x80 (10000000) header bits
+		bytePtr[1] = (u8)(((character >> 6) & 0x3F) | 0x80);
+
+		// Last byte: low 6 bits, OR the 0x80 (10000000) header bits
+		bytePtr[0] = (u8)((character & 0x3F) | 0x80);
+
+		return 4;
+	}
+
+	// Above the last valid codepoint (0x10FFFF)
+	return 0;
+}
+
+// Decodes one UTF-8 sequence, packed into a u32, to its UCS codepoint.
+//
+// 'character' holds the sequence as a number: the lead byte sits in the most
+// significant byte used and the last continuation byte in the least significant
+// byte (e.g. 0xC9B8 decodes to codepoint 0x278). All bytes above the sequence
+// must be zero.
+//
+// dest receives the decoded codepoint and is only written on success.
+// Returns the number of UTF-8 bytes in the sequence (1 to 4), or 0 if dest is
+// NULL or the header bits of character do not form a valid sequence.
+//
+// NOTE: Overlong encodings, UTF-16 surrogates and results above 0x10FFFF are
+// not rejected here.
+DQNT_FILE_SCOPE u32 dqnt_utf8_to_ucs(u32 *dest, u32 character)
+{
+	if (!dest) return 0;
+
+	// Each check below masks the header bits of every byte, including the 0 bit
+	// that terminates the lead byte's run of 1s and the unused high bytes, then
+	// compares against the exact header pattern. Testing only that the header's
+	// 1 bits are set would also accept invalid lead bytes (e.g. 0xF8-0xFF) and
+	// let stray bits leak into the result.
+
+	// UTF Header Bits     : 11110000 10000000 10000000 10000000
+	// UTF Bit Arrangement : 00000xxx 00xxxxxx 00xxxxxx 00xxxxxx
+	// UCS                 : 00000000 00000xxx xxxxxxxx xxxxxxxx
+	const u32 headerMask4Bytes = 0xF8C0C0C0;
+	const u32 headerBits4Bytes = 0xF0808080;
+	if ((character & headerMask4Bytes) == headerBits4Bytes)
+	{
+		u32 firstByte  = character & 0x3F;
+		u32 secondByte = (character >> 8) & 0x3F;
+		u32 thirdByte  = (character >> 16) & 0x3F;
+		u32 fourthByte = (character >> 24) & 0x07;
+
+		*dest =
+		    (fourthByte << 18 | thirdByte << 12 | secondByte << 6 | firstByte);
+
+		return 4;
+	}
+
+	// UTF Header Bits     : 11100000 10000000 10000000
+	// UTF Bit Arrangement : 0000xxxx 00xxxxxx 00xxxxxx
+	// UCS                 : 00000000 xxxxxxxx xxxxxxxx
+	const u32 headerMask3Bytes = 0xFFF0C0C0;
+	const u32 headerBits3Bytes = 0x00E08080;
+	if ((character & headerMask3Bytes) == headerBits3Bytes)
+	{
+		u32 firstByte  = character & 0x3F;
+		u32 secondByte = (character >> 8) & 0x3F;
+		u32 thirdByte  = (character >> 16) & 0x0F;
+
+		*dest = (thirdByte << 12 | secondByte << 6 | firstByte);
+
+		return 3;
+	}
+
+	// UTF Header Bits    : 11000000 10000000
+	// UTF Bit Arrangement: 000xxxxx 00xxxxxx
+	// UCS                : 00000xxx xxxxxxxx
+	const u32 headerMask2Bytes = 0xFFFFE0C0;
+	const u32 headerBits2Bytes = 0x0000C080;
+	if ((character & headerMask2Bytes) == headerBits2Bytes)
+	{
+		u32 firstByte  = character & 0x3F;
+		u32 secondByte = (character >> 8) & 0x1F;
+
+		*dest = (secondByte << 6 | firstByte);
+
+		return 2;
+	}
+
+	// Character is within ASCII range, so it's an ascii character
+	// UTF Bit Arrangement: 0xxxxxxx
+	// UCS                : 0xxxxxxx
+	if (character < 0x80)
+	{
+		// ASCII carries 7 payload bits; the value is already the codepoint.
+		// (Masking with 0x3F here would mangle 0x40-0x7F, e.g. '@' -> 0.)
+		*dest = character;
+
+		return 1;
+	}
+
+	return 0;
 }

 ////////////////////////////////////////////////////////////////////////////////
--- a/dqnt_unit_test.cpp
+++ b/dqnt_unit_test.cpp
@ -208,49 +208,74 @@ void dqnt_strings_test()
 		}
 	}

-	// Wide String Checks
+    // UCS <-> UTF8 Checks
    {
-		// wstrcmp
+	    // Test ascii characters
 	    {
-			wchar_t *a = L"str_a";
+		    u32 codepoint = '@';
+		    u32 string[1] = {};

-			// Check simple compares
-			{
-				DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == +0);
-				DQNT_ASSERT(dqnt_wstrcmp(a, L"str_b") == -1);
-				DQNT_ASSERT(dqnt_wstrcmp(L"str_b", a) == +1);
-				DQNT_ASSERT(dqnt_wstrcmp(a, L"") == +1);
-				DQNT_ASSERT(dqnt_wstrcmp(L"", L"") == 0);
+		    u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
+		    DQNT_ASSERT(bytesUsed == 1);
+		    DQNT_ASSERT(string[0] == '@');

-				// NOTE: Check that the string has not been trashed.
-				DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == +0);
+		    bytesUsed = dqnt_utf8_to_ucs(&string[0], codepoint);
+		    DQNT_ASSERT(string[0] >= 0 && string[0] < 0x80);
+		    DQNT_ASSERT(bytesUsed == 1);
 	    }

-			// Check ops against null
+	    // Test 2 byte characters
 		{
-				DQNT_ASSERT(dqnt_wstrcmp(NULL, NULL) != +0);
-				DQNT_ASSERT(dqnt_wstrcmp(a, NULL) != +0);
-				DQNT_ASSERT(dqnt_wstrcmp(NULL, a) != +0);
+		    u32 codepoint = 0x278;
+		    u32 string[1] = {};
+
+		    u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
+		    DQNT_ASSERT(bytesUsed == 2);
+		    DQNT_ASSERT(string[0] == 0xC9B8);
+
+		    bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]);
+		    DQNT_ASSERT(string[0] == codepoint);
+		    DQNT_ASSERT(bytesUsed == 2);
 	    }

-			printf("dqnt_strings_test(): wstrcmp: Completed successfully\n");
-		}
-
-	    // wstrlen
+	    // Test 3 byte characters
 		{
-		    wchar_t *a = L"str_a";
-		    DQNT_ASSERT(dqnt_wstrlen(a) == 5);
-		    DQNT_ASSERT(dqnt_wstrlen(L"") == 0);
-		    DQNT_ASSERT(dqnt_wstrlen(L"   a  ") == 6);
-		    DQNT_ASSERT(dqnt_wstrlen(L"a\n") == 2);
+		    u32 codepoint = 0x0A0A;
+		    u32 string[1] = {};

-		    // NOTE: Check that the string has not been trashed.
-		    DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == 0);
+			u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
+		    DQNT_ASSERT(bytesUsed == 3);
+			DQNT_ASSERT(string[0] == 0xE0A88A);

-		    DQNT_ASSERT(dqnt_wstrlen(NULL) == 0);
-
-		    printf("dqnt_strings_test(): wstrlen: Completed successfully\n");
+		    bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]);
+		    DQNT_ASSERT(string[0] == codepoint);
+		    DQNT_ASSERT(bytesUsed == 3);
 	    }
+
+	    // Test 4 byte characters
+		{
+		    u32 codepoint = 0x10912;
+		    u32 string[1] = {};
+			u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
+
+		    DQNT_ASSERT(bytesUsed == 4);
+		    DQNT_ASSERT(string[0] == 0xF090A492);
+
+		    bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]);
+		    DQNT_ASSERT(string[0] == codepoint);
+		    DQNT_ASSERT(bytesUsed == 4);
+	    }
+
+		{
+		    u32 codepoint = 0x10912;
+			u32 bytesUsed = dqnt_ucs_to_utf8(NULL, codepoint);
+		    DQNT_ASSERT(bytesUsed == 0);
+
+		    bytesUsed = dqnt_utf8_to_ucs(NULL, codepoint);
+		    DQNT_ASSERT(bytesUsed == 0);
+	    }
+
+	    printf("dqnt_strings_test(): ucs <-> utf8: Completed successfully\n");
    }

    printf("dqnt_strings_test(): Completed successfully\n");