diff --git a/dqnt.h b/dqnt.h index 8959819..5bbc0dd 100644 --- a/dqnt.h +++ b/dqnt.h @@ -162,6 +162,7 @@ DQNT_FILE_SCOPE DqntV2 dqnt_rect_get_size_v2(DqntRect rect); DQNT_FILE_SCOPE DqntV2 dqnt_rect_get_centre (DqntRect rect); DQNT_FILE_SCOPE DqntRect dqnt_rect_move (DqntRect rect, DqntV2 shift); DQNT_FILE_SCOPE bool dqnt_rect_contains_p (DqntRect rect, DqntV2 p); + //////////////////////////////////////////////////////////////////////////////// // String Ops //////////////////////////////////////////////////////////////////////////////// @@ -179,9 +180,24 @@ DQNT_FILE_SCOPE bool dqnt_str_reverse(char *buf, const i32 bufSize); DQNT_FILE_SCOPE i32 dqnt_str_to_i32 (char *const buf, const i32 bufSize); DQNT_FILE_SCOPE void dqnt_i32_to_str (i32 value, char *buf, i32 bufSize); -DQNT_FILE_SCOPE i32 dqnt_wstrcmp(const wchar_t *a, const wchar_t *b); -DQNT_FILE_SCOPE void dqnt_wstrcat(const wchar_t *a, i32 lenA, const wchar_t *b, i32 lenB, wchar_t *out, i32 outLen); -DQNT_FILE_SCOPE i32 dqnt_wstrlen(const wchar_t *a); +// Both return the number of bytes read, return 0 if invalid codepoint or UTF8 +DQNT_FILE_SCOPE u32 dqnt_ucs_to_utf8(u32 *dest, u32 character); +DQNT_FILE_SCOPE u32 dqnt_utf8_to_ucs(u32 *dest, u32 character); +//////////////////////////////////////////////////////////////////////////////// +// File Operations +//////////////////////////////////////////////////////////////////////////////// +typedef struct DqntFile +{ + void *handle; + u64 size; +} DqntFile; + +#if 0 +bool platform_open_file (char *const file, PlatformFile *platformFile); +// Return the number of bytes read +u32 platform_read_file (PlatformFile file, void *buffer, u32 numBytesToRead); +void platform_close_file(PlatformFile *file); +#endif //////////////////////////////////////////////////////////////////////////////// // Timer @@ -728,7 +744,7 @@ DQNT_FILE_SCOPE inline bool dqnt_rect_contains_p(DqntRect rect, DqntV2 p) } //////////////////////////////////////////////////////////////////////////////// -// String Ops +// String Operations //////////////////////////////////////////////////////////////////////////////// DQNT_FILE_SCOPE bool dqnt_char_is_digit(char c) { @@ -870,27 +886,166 @@ DQNT_FILE_SCOPE void dqnt_i32_to_str(i32 value, char *buf, i32 bufSize) } } -DQNT_FILE_SCOPE i32 dqnt_wstrcmp(const wchar_t *a, const wchar_t *b) -{ - if (!a && !b) return -1; - if (!a) return -1; - if (!b) return -1; +/* + Encoding + The following byte sequences are used to represent a character. The sequence + to be used depends on the UCS code number of the character: - while ((*a) == (*b)) + The extra 1's are the headers used to identify the string as a UTF-8 string. + UCS [0x00000000, 0x0000007F] -> UTF-8 0xxxxxxx + UCS [0x00000080, 0x000007FF] -> UTF-8 110xxxxx 10xxxxxx + UCS [0x00000800, 0x0000FFFF] -> UTF-8 1110xxxx 10xxxxxx 10xxxxxx + UCS [0x00010000, 0x001FFFFF] -> UTF-8 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + UCS [0x00200000, 0x03FFFFFF] -> N/A 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + UCS [0x04000000, 0x7FFFFFFF] -> N/A 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + + The xxx bit positions are filled with the bits of the character code number + in binary representation. Only the shortest possible multibyte sequence + which can represent the code number of the character can be used. + + The UCS code values 0xd800–0xdfff (UTF-16 surrogates) as well as 0xfffe and + 0xffff (UCS noncharacters) should not appear in conforming UTF-8 streams. +*/ +DQNT_FILE_SCOPE u32 dqnt_ucs_to_utf8(u32 *dest, u32 character) +{ + if (!dest) return 0; + + u8 *bytePtr = (u8 *)dest; + + // Character is within ASCII range, so it's an ascii character + // UTF Bit Arrangement: 0xxxxxxx + // Character : 0xxxxxxx + if (character >= 0 && character < 0x80) { - if (!(*a)) return 0; - a++; - b++; + bytePtr[0] = (u8)character; + return 1; } - return (((*a) < (*b)) ? -1 : 1); + // UTF Header Bits : 11000000 00xxxxxx + // UTF Bit Arrangement: 000xxxxx 00xxxxxx + // Character : 00000xxx xxxxxxxx + if (character < 0x800) + { + // Add the 2nd byte, 6 bits, OR the 0xC0 (11000000) header bits + bytePtr[1] = (u8)((character >> 6) | 0xC0); + + // Add the 1st byte, 6 bits, plus the 0x80 (10000000) header bits + bytePtr[0] = (u8)((character & 0x3F) | 0x80); + + return 2; + } + + // UTF Header Bits : 11100000 10000000 10000000 + // UTF Bit Arrangement : 0000xxxx 00xxxxxx 00xxxxxx + // Character : 00000000 xxxxxxxx xxxxxxxx + if (character < 0x10000) + { + // Add the 3rd byte, 4 bits, OR the 0xE0 (11100000) header bits + bytePtr[2] = (u8)((character >> 12) | 0xE0); + + // Add the 2nd byte, 6 bits, OR the 0x80 (10000000) header bits + bytePtr[1] = (u8)((character >> 6) | 0x80); + + // Add the 1st byte, 6 bits, plus the 0x80 (10000000) header bits + bytePtr[0] = (u8)((character & 0x3F) | 0x80); + + return 3; + } + + // UTF Header Bits : 11110000 10000000 10000000 10000000 + // UTF Bit Arrangement : 00000xxx 00xxxxxx 00xxxxxx 00xxxxxx + // Character : 00000000 00000xxx xxxxxxxx xxxxxxxx + if (character < 0x110000) + { + // Add the 4th byte, 3 bits, OR the 0xF0 (11110000) header bits + bytePtr[3] = (u8)((character >> 18) | 0xF0); + + // Add the 3rd byte, 6 bits, OR the 0x80 (10000000) header bits + bytePtr[2] = (u8)(((character >> 12) & 0x3F) | 0x80); + + // Add the 2nd byte, 6 bits, plus the 0x80 (10000000) header bits + bytePtr[1] = (u8)(((character >> 6) & 0x3F) | 0x80); + + // Add the 2nd byte, 6 bits, plus the 0x80 (10000000) header bits + bytePtr[0] = (u8)((character & 0x3F) | 0x80); + + return 4; + } + + return 0; } -DQNT_FILE_SCOPE i32 dqnt_wstrlen(const wchar_t *a) +DQNT_FILE_SCOPE u32 dqnt_utf8_to_ucs(u32 *dest, u32 character) { - i32 result = 0; - while (a && a[result]) result++; - return result; + if (!dest) return 0; + + // UTF Header Bits : 11110000 10000000 10000000 10000000 + // UTF Bit Arrangement : 00000xxx 00xxxxxx 00xxxxxx 00xxxxxx + // UCS : 00000000 00000xxx xxxxxxxx xxxxxxxx + const u32 headerBits4Bytes = 0xF0808080; + if ((character & headerBits4Bytes) == headerBits4Bytes) + { + u32 utfWithoutHeader = headerBits4Bytes ^ character; + + u32 firstByte = utfWithoutHeader & 0x3F; + u32 secondByte = (utfWithoutHeader >> 8) & 0x3F; + u32 thirdByte = (utfWithoutHeader >> 16) & 0x3F; + u32 fourthByte = utfWithoutHeader >> 24; + + u32 result = + (fourthByte << 18 | thirdByte << 12 | secondByte << 6 | firstByte); + *dest = result; + + return 4; + } + + // UTF Header Bits : 11100000 10000000 10000000 + // UTF Bit Arrangement : 0000xxxx 00xxxxxx 00xxxxxx + // UCS : 00000000 xxxxxxxx xxxxxxxx + const u32 headerBits3Bytes = 0xE08080; + if ((character & headerBits3Bytes) == headerBits3Bytes) + { + u32 utfWithoutHeader = headerBits3Bytes ^ character; + + u32 firstByte = utfWithoutHeader & 0x3F; + u32 secondByte = (utfWithoutHeader >> 8) & 0x3F; + u32 thirdByte = utfWithoutHeader >> 16; + + u32 result = (thirdByte << 12 | secondByte << 6 | firstByte); + *dest = result; + + return 3; + } + + // UTF Header Bits : 11000000 00xxxxxx + // UTF Bit Arrangement: 000xxxxx 00xxxxxx + // UCS : 00000xxx xxxxxxxx + const u32 headerBits2Bytes = 0xC000; + if ((character & headerBits2Bytes) == headerBits2Bytes) + { + u32 utfWithoutHeader = headerBits2Bytes ^ character; + + u32 firstByte = utfWithoutHeader & 0x3F; + u32 secondByte = utfWithoutHeader >> 8; + + u32 result = (secondByte << 6 | firstByte); + *dest = result; + + return 2; + } + + // Character is within ASCII range, so it's an ascii character + // UTF Bit Arrangement: 0xxxxxxx + // UCS : 0xxxxxxx + if (character >= 0x0 && character < 0x80) + { + u32 firstByte = (character & 0x3F); + *dest = firstByte; + + return 1; + } + + return 0; } //////////////////////////////////////////////////////////////////////////////// diff --git a/dqnt_unit_test.cpp b/dqnt_unit_test.cpp index 2e815f7..577f232 100644 --- a/dqnt_unit_test.cpp +++ b/dqnt_unit_test.cpp @@ -206,54 +206,79 @@ void dqnt_strings_test() printf("dqnt_strings_test(): str_to_i32: Completed successfully\n"); } - } + } - // Wide String Checks - { - // wstrcmp - { - wchar_t *a = L"str_a"; - - // Check simple compares - { - DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == +0); - DQNT_ASSERT(dqnt_wstrcmp(a, L"str_b") == -1); - DQNT_ASSERT(dqnt_wstrcmp(L"str_b", a) == +1); - DQNT_ASSERT(dqnt_wstrcmp(a, L"") == +1); - DQNT_ASSERT(dqnt_wstrcmp(L"", L"") == 0); - - // NOTE: Check that the string has not been trashed. - DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == +0); - } - - // Check ops against null - { - DQNT_ASSERT(dqnt_wstrcmp(NULL, NULL) != +0); - DQNT_ASSERT(dqnt_wstrcmp(a, NULL) != +0); - DQNT_ASSERT(dqnt_wstrcmp(NULL, a) != +0); - } - - printf("dqnt_strings_test(): wstrcmp: Completed successfully\n"); - } - - // wstrlen + // UCS <-> UTF8 Checks + { + // Test ascii characters { - wchar_t *a = L"str_a"; - DQNT_ASSERT(dqnt_wstrlen(a) == 5); - DQNT_ASSERT(dqnt_wstrlen(L"") == 0); - DQNT_ASSERT(dqnt_wstrlen(L" a ") == 6); - DQNT_ASSERT(dqnt_wstrlen(L"a\n") == 2); + u32 codepoint = '@'; + u32 string[1] = {}; - // NOTE: Check that the string has not been trashed. - DQNT_ASSERT(dqnt_wstrcmp(a, L"str_a") == 0); + u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint); + DQNT_ASSERT(bytesUsed == 1); + DQNT_ASSERT(string[0] == '@'); - DQNT_ASSERT(dqnt_wstrlen(NULL) == 0); - - printf("dqnt_strings_test(): wstrlen: Completed successfully\n"); + bytesUsed = dqnt_utf8_to_ucs(&string[0], codepoint); + DQNT_ASSERT(string[0] >= 0 && string[0] < 0x80); + DQNT_ASSERT(bytesUsed == 1); } + + // Test 2 byte characters + { + u32 codepoint = 0x278; + u32 string[1] = {}; + + u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint); + DQNT_ASSERT(bytesUsed == 2); + DQNT_ASSERT(string[0] == 0xC9B8); + + bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]); + DQNT_ASSERT(string[0] == codepoint); + DQNT_ASSERT(bytesUsed == 2); + } + + // Test 3 byte characters + { + u32 codepoint = 0x0A0A; + u32 string[1] = {}; + + u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint); + DQNT_ASSERT(bytesUsed == 3); + DQNT_ASSERT(string[0] == 0xE0A88A); + + bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]); + DQNT_ASSERT(string[0] == codepoint); + DQNT_ASSERT(bytesUsed == 3); + } + + // Test 4 byte characters + { + u32 codepoint = 0x10912; + u32 string[1] = {}; + u32 bytesUsed = dqnt_ucs_to_utf8(&string[0], codepoint); + + DQNT_ASSERT(bytesUsed == 4); + DQNT_ASSERT(string[0] == 0xF090A492); + + bytesUsed = dqnt_utf8_to_ucs(&string[0], string[0]); + DQNT_ASSERT(string[0] == codepoint); + DQNT_ASSERT(bytesUsed == 4); + } + + { + u32 codepoint = 0x10912; + u32 bytesUsed = dqnt_ucs_to_utf8(NULL, codepoint); + DQNT_ASSERT(bytesUsed == 0); + + bytesUsed = dqnt_utf8_to_ucs(NULL, codepoint); + DQNT_ASSERT(bytesUsed == 0); + } + + printf("dqnt_strings_test(): ucs <-> utf8: Completed successfully\n"); } - printf("dqnt_strings_test(): Completed successfully\n"); + printf("dqnt_strings_test(): Completed successfully\n"); } #include "Windows.h"