Add ucs to utf8 conversion to and from
This commit is contained in:
parent
30e219e7c3
commit
3e0064d741
191
dqnt.h
191
dqnt.h
@ -162,6 +162,7 @@ DQNT_FILE_SCOPE DqntV2 dqnt_rect_get_size_v2(DqntRect rect);
|
||||
DQNT_FILE_SCOPE DqntV2 dqnt_rect_get_centre (DqntRect rect);
|
||||
DQNT_FILE_SCOPE DqntRect dqnt_rect_move (DqntRect rect, DqntV2 shift);
|
||||
DQNT_FILE_SCOPE bool dqnt_rect_contains_p (DqntRect rect, DqntV2 p);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// String Ops
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
@ -179,9 +180,24 @@ DQNT_FILE_SCOPE bool dqnt_str_reverse(char *buf, const i32 bufSize);
|
||||
DQNT_FILE_SCOPE i32 dqnt_str_to_i32 (char *const buf, const i32 bufSize);
|
||||
DQNT_FILE_SCOPE void dqnt_i32_to_str (i32 value, char *buf, i32 bufSize);
|
||||
|
||||
DQNT_FILE_SCOPE i32 dqnt_wstrcmp(const wchar_t *a, const wchar_t *b);
|
||||
DQNT_FILE_SCOPE void dqnt_wstrcat(const wchar_t *a, i32 lenA, const wchar_t *b, i32 lenB, wchar_t *out, i32 outLen);
|
||||
DQNT_FILE_SCOPE i32 dqnt_wstrlen(const wchar_t *a);
|
||||
// Both return the number of UTF-8 bytes in the encoding (written by ucs_to_utf8,
// read by utf8_to_ucs). Return 0 if dest is NULL or the codepoint/UTF-8 is invalid.
|
||||
DQNT_FILE_SCOPE u32 dqnt_ucs_to_utf8(u32 *dest, u32 character);
|
||||
DQNT_FILE_SCOPE u32 dqnt_utf8_to_ucs(u32 *dest, u32 character);
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// File Operations
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
typedef struct DqntFile
|
||||
{
|
||||
void *handle;
|
||||
u64 size;
|
||||
} DqntFile;
|
||||
|
||||
#if 0
|
||||
bool platform_open_file (char *const file, PlatformFile *platformFile);
|
||||
// Return the number of bytes read
|
||||
u32 platform_read_file (PlatformFile file, void *buffer, u32 numBytesToRead);
|
||||
void platform_close_file(PlatformFile *file);
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Timer
|
||||
@ -728,7 +744,7 @@ DQNT_FILE_SCOPE inline bool dqnt_rect_contains_p(DqntRect rect, DqntV2 p)
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// String Ops
|
||||
// String Operations
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
DQNT_FILE_SCOPE bool dqnt_char_is_digit(char c)
|
||||
{
|
||||
@ -870,27 +886,166 @@ DQNT_FILE_SCOPE void dqnt_i32_to_str(i32 value, char *buf, i32 bufSize)
|
||||
}
|
||||
}
|
||||
|
||||
DQNT_FILE_SCOPE i32 dqnt_wstrcmp(const wchar_t *a, const wchar_t *b)
|
||||
{
|
||||
if (!a && !b) return -1;
|
||||
if (!a) return -1;
|
||||
if (!b) return -1;
|
||||
/*
|
||||
Encoding
|
||||
The following byte sequences are used to represent a character. The sequence
|
||||
to be used depends on the UCS code number of the character:
|
||||
|
||||
while ((*a) == (*b))
|
||||
The leading 1 bits of the first byte (and the 10 prefix of each following
byte) are header bits that mark the length of the UTF-8 byte sequence.
|
||||
UCS [0x00000000, 0x0000007F] -> UTF-8 0xxxxxxx
|
||||
UCS [0x00000080, 0x000007FF] -> UTF-8 110xxxxx 10xxxxxx
|
||||
UCS [0x00000800, 0x0000FFFF] -> UTF-8 1110xxxx 10xxxxxx 10xxxxxx
|
||||
UCS [0x00010000, 0x001FFFFF] -> UTF-8 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
UCS [0x00200000, 0x03FFFFFF] -> N/A 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
UCS [0x04000000, 0x7FFFFFFF] -> N/A 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
|
||||
The xxx bit positions are filled with the bits of the character code number
|
||||
in binary representation. Only the shortest possible multibyte sequence
|
||||
which can represent the code number of the character can be used.
|
||||
|
||||
The UCS code values 0xd800–0xdfff (UTF-16 surrogates) as well as 0xfffe and
|
||||
0xffff (UCS noncharacters) should not appear in conforming UTF-8 streams.
|
||||
*/
|
||||
DQNT_FILE_SCOPE u32 dqnt_ucs_to_utf8(u32 *dest, u32 character)
|
||||
{
|
||||
if (!dest) return 0;
|
||||
|
||||
u8 *bytePtr = (u8 *)dest;
|
||||
|
||||
// Character is within ASCII range, so it's an ascii character
|
||||
// UTF Bit Arrangement: 0xxxxxxx
|
||||
// Character : 0xxxxxxx
|
||||
if (character >= 0 && character < 0x80)
|
||||
{
|
||||
if (!(*a)) return 0;
|
||||
a++;
|
||||
b++;
|
||||
bytePtr[0] = (u8)character;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return (((*a) < (*b)) ? -1 : 1);
|
||||
// UTF Header Bits : 11000000 10000000
|
||||
// UTF Bit Arrangement: 000xxxxx 00xxxxxx
|
||||
// Character : 00000xxx xxxxxxxx
|
||||
if (character < 0x800)
|
||||
{
|
||||
// Add the 2nd byte, 5 bits, OR the 0xC0 (11000000) header bits
|
||||
bytePtr[1] = (u8)((character >> 6) | 0xC0);
|
||||
|
||||
// Add the 1st byte, 6 bits, plus the 0x80 (10000000) header bits
|
||||
bytePtr[0] = (u8)((character & 0x3F) | 0x80);
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
// UTF Header Bits : 11100000 10000000 10000000
|
||||
// UTF Bit Arrangement : 0000xxxx 00xxxxxx 00xxxxxx
|
||||
// Character : 00000000 xxxxxxxx xxxxxxxx
|
||||
if (character < 0x10000)
|
||||
{
|
||||
// Add the 3rd byte, 4 bits, OR the 0xE0 (11100000) header bits
|
||||
bytePtr[2] = (u8)((character >> 12) | 0xE0);
|
||||
|
||||
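// Add the 2nd byte, 6 bits, OR the 0x80 (10000000) header bits
// NOTE(review): unlike the 4 byte case, the shifted value is not masked with
// 0x3F here, so bits above the low 6 can leak into this byte -- confirm.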
|
||||
bytePtr[1] = (u8)((character >> 6) | 0x80);
|
||||
|
||||
// Add the 1st byte, 6 bits, plus the 0x80 (10000000) header bits
|
||||
bytePtr[0] = (u8)((character & 0x3F) | 0x80);
|
||||
|
||||
return 3;
|
||||
}
|
||||
|
||||
// UTF Header Bits : 11110000 10000000 10000000 10000000
|
||||
// UTF Bit Arrangement : 00000xxx 00xxxxxx 00xxxxxx 00xxxxxx
|
||||
// Character : 00000000 00000xxx xxxxxxxx xxxxxxxx
|
||||
if (character < 0x110000)
|
||||
{
|
||||
// Add the 4th byte, 3 bits, OR the 0xF0 (11110000) header bits
|
||||
bytePtr[3] = (u8)((character >> 18) | 0xF0);
|
||||
|
||||
// Add the 3rd byte, 6 bits, OR the 0x80 (10000000) header bits
|
||||
bytePtr[2] = (u8)(((character >> 12) & 0x3F) | 0x80);
|
||||
|
||||
// Add the 2nd byte, 6 bits, plus the 0x80 (10000000) header bits
|
||||
bytePtr[1] = (u8)(((character >> 6) & 0x3F) | 0x80);
|
||||
|
||||
// Add the 1st byte, 6 bits, plus the 0x80 (10000000) header bits
|
||||
bytePtr[0] = (u8)((character & 0x3F) | 0x80);
|
||||
|
||||
return 4;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
DQNT_FILE_SCOPE i32 dqnt_wstrlen(const wchar_t *a)
|
||||
DQNT_FILE_SCOPE u32 dqnt_utf8_to_ucs(u32 *dest, u32 character)
|
||||
{
|
||||
i32 result = 0;
|
||||
while (a && a[result]) result++;
|
||||
return result;
|
||||
if (!dest) return 0;
|
||||
|
||||
// UTF Header Bits : 11110000 10000000 10000000 10000000
|
||||
// UTF Bit Arrangement : 00000xxx 00xxxxxx 00xxxxxx 00xxxxxx
|
||||
// UCS : 00000000 00000xxx xxxxxxxx xxxxxxxx
|
||||
const u32 headerBits4Bytes = 0xF0808080;
|
||||
if ((character & headerBits4Bytes) == headerBits4Bytes)
|
||||
{
|
||||
u32 utfWithoutHeader = headerBits4Bytes ^ character;
|
||||
|
||||
u32 firstByte = utfWithoutHeader & 0x3F;
|
||||
u32 secondByte = (utfWithoutHeader >> 8) & 0x3F;
|
||||
u32 thirdByte = (utfWithoutHeader >> 16) & 0x3F;
|
||||
u32 fourthByte = utfWithoutHeader >> 24;
|
||||
|
||||
u32 result =
|
||||
(fourthByte << 18 | thirdByte << 12 | secondByte << 6 | firstByte);
|
||||
*dest = result;
|
||||
|
||||
return 4;
|
||||
}
|
||||
|
||||
// UTF Header Bits : 11100000 10000000 10000000
|
||||
// UTF Bit Arrangement : 0000xxxx 00xxxxxx 00xxxxxx
|
||||
// UCS : 00000000 xxxxxxxx xxxxxxxx
|
||||
const u32 headerBits3Bytes = 0xE08080;
|
||||
if ((character & headerBits3Bytes) == headerBits3Bytes)
|
||||
{
|
||||
u32 utfWithoutHeader = headerBits3Bytes ^ character;
|
||||
|
||||
u32 firstByte = utfWithoutHeader & 0x3F;
|
||||
u32 secondByte = (utfWithoutHeader >> 8) & 0x3F;
|
||||
u32 thirdByte = utfWithoutHeader >> 16;
|
||||
|
||||
u32 result = (thirdByte << 12 | secondByte << 6 | firstByte);
|
||||
*dest = result;
|
||||
|
||||
return 3;
|
||||
}
|
||||
|
||||
// UTF Header Bits : 11000000 00xxxxxx
|
||||
// UTF Bit Arrangement: 000xxxxx 00xxxxxx
|
||||
// UCS : 00000xxx xxxxxxxx
|
||||
const u32 headerBits2Bytes = 0xC000;
|
||||
if ((character & headerBits2Bytes) == headerBits2Bytes)
|
||||
{
|
||||
u32 utfWithoutHeader = headerBits2Bytes ^ character;
|
||||
|
||||
u32 firstByte = utfWithoutHeader & 0x3F;
|
||||
u32 secondByte = utfWithoutHeader >> 8;
|
||||
|
||||
u32 result = (secondByte << 6 | firstByte);
|
||||
*dest = result;
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
// Character is within ASCII range, so it's an ascii character
|
||||
// UTF Bit Arrangement: 0xxxxxxx
|
||||
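// UCS : 0xxxxxxx
// NOTE(review): the mask below is 0x3F (6 bits), which drops bit 6 of a 7 bit
// ASCII value (e.g. '@' = 0x40 becomes 0); 0x7F looks intended -- confirm.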
|
||||
if (character >= 0x0 && character < 0x80)
|
||||
{
|
||||
u32 firstByte = (character & 0x3F);
|
||||
*dest = firstByte;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -206,54 +206,79 @@ void dqnt_strings_test()
|
||||
|
||||
printf("dqnt_strings_test(): str_to_i32: Completed successfully\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wide String Checks
|
||||
{
|
||||
// wstrcmp
|
||||
{
|
||||
wchar_t *a = L"str_a";
|
||||
|
||||
// Check simple compares
|
||||
{
|
||||
DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == +0);
|
||||
DQNT_ASSERT(dqnt_wstrcmp(a, L"str_b") == -1);
|
||||
DQNT_ASSERT(dqnt_wstrcmp(L"str_b", a) == +1);
|
||||
DQNT_ASSERT(dqnt_wstrcmp(a, L"") == +1);
|
||||
DQNT_ASSERT(dqnt_wstrcmp(L"", L"") == 0);
|
||||
|
||||
// NOTE: Check that the string has not been trashed.
|
||||
DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == +0);
|
||||
}
|
||||
|
||||
// Check ops against null
|
||||
{
|
||||
DQNT_ASSERT(dqnt_wstrcmp(NULL, NULL) != +0);
|
||||
DQNT_ASSERT(dqnt_wstrcmp(a, NULL) != +0);
|
||||
DQNT_ASSERT(dqnt_wstrcmp(NULL, a) != +0);
|
||||
}
|
||||
|
||||
printf("dqnt_strings_test(): wstrcmp: Completed successfully\n");
|
||||
}
|
||||
|
||||
// wstrlen
|
||||
// UCS <-> UTF8 Checks
|
||||
{
|
||||
// Test ascii characters
|
||||
{
|
||||
wchar_t *a = L"str_a";
|
||||
DQNT_ASSERT(dqnt_wstrlen(a) == 5);
|
||||
DQNT_ASSERT(dqnt_wstrlen(L"") == 0);
|
||||
DQNT_ASSERT(dqnt_wstrlen(L" a ") == 6);
|
||||
DQNT_ASSERT(dqnt_wstrlen(L"a\n") == 2);
|
||||
u32 codepoint = '@';
|
||||
u32 string[1] = {};
|
||||
|
||||
// NOTE: Check that the string has not been trashed.
|
||||
DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == 0);
|
||||
u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
|
||||
DQNT_ASSERT(bytesUsed == 1);
|
||||
DQNT_ASSERT(string[0] == '@');
|
||||
|
||||
DQNT_ASSERT(dqnt_wstrlen(NULL) == 0);
|
||||
|
||||
printf("dqnt_strings_test(): wstrlen: Completed successfully\n");
|
||||
bytesUsed = dqnt_utf8_to_ucs(&string[0], codepoint);
|
||||
DQNT_ASSERT(string[0] >= 0 && string[0] < 0x80);
|
||||
DQNT_ASSERT(bytesUsed == 1);
|
||||
}
|
||||
|
||||
// Test 2 byte characters
|
||||
{
|
||||
u32 codepoint = 0x278;
|
||||
u32 string[1] = {};
|
||||
|
||||
u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
|
||||
DQNT_ASSERT(bytesUsed == 2);
|
||||
DQNT_ASSERT(string[0] == 0xC9B8);
|
||||
|
||||
bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]);
|
||||
DQNT_ASSERT(string[0] == codepoint);
|
||||
DQNT_ASSERT(bytesUsed == 2);
|
||||
}
|
||||
|
||||
// Test 3 byte characters
|
||||
{
|
||||
u32 codepoint = 0x0A0A;
|
||||
u32 string[1] = {};
|
||||
|
||||
u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
|
||||
DQNT_ASSERT(bytesUsed == 3);
|
||||
DQNT_ASSERT(string[0] == 0xE0A88A);
|
||||
|
||||
bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]);
|
||||
DQNT_ASSERT(string[0] == codepoint);
|
||||
DQNT_ASSERT(bytesUsed == 3);
|
||||
}
|
||||
|
||||
// Test 4 byte characters
|
||||
{
|
||||
u32 codepoint = 0x10912;
|
||||
u32 string[1] = {};
|
||||
u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint);
|
||||
|
||||
DQNT_ASSERT(bytesUsed == 4);
|
||||
DQNT_ASSERT(string[0] == 0xF090A492);
|
||||
|
||||
bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]);
|
||||
DQNT_ASSERT(string[0] == codepoint);
|
||||
DQNT_ASSERT(bytesUsed == 4);
|
||||
}
|
||||
|
||||
{
|
||||
u32 codepoint = 0x10912;
|
||||
u32 bytesUsed = dqnt_ucs_to_utf8(NULL, codepoint);
|
||||
DQNT_ASSERT(bytesUsed == 0);
|
||||
|
||||
bytesUsed = dqnt_utf8_to_ucs(NULL, codepoint);
|
||||
DQNT_ASSERT(bytesUsed == 0);
|
||||
}
|
||||
|
||||
printf("dqnt_strings_test(): ucs <-> utf8: Completed successfully\n");
|
||||
}
|
||||
|
||||
printf("dqnt_strings_test(): Completed successfully\n");
|
||||
printf("dqnt_strings_test(): Completed successfully\n");
|
||||
}
|
||||
|
||||
#include "Windows.h"
|
||||
|
Loading…
Reference in New Issue
Block a user