289 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			289 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| #include "dn_csv.h"
 | |
| 
 | |
| static DN_CSVTokeniser DN_CSV_TokeniserInit(DN_Str8 string, char delimiter)
 | |
| {
 | |
|   DN_CSVTokeniser result = {};
 | |
|   result.string          = string;
 | |
|   result.delimiter       = delimiter;
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| static bool DN_CSV_TokeniserValid(DN_CSVTokeniser *tokeniser)
 | |
| {
 | |
|   bool result = tokeniser && !tokeniser->bad;
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| static bool DN_CSV_TokeniserNextRow(DN_CSVTokeniser *tokeniser)
 | |
| {
 | |
|   bool result = false;
 | |
|   if (DN_CSV_TokeniserValid(tokeniser) && DN_Str8HasData(tokeniser->string)) {
 | |
|     // NOTE: First time querying row iterator is nil, let tokeniser advance
 | |
|     if (tokeniser->it) {
 | |
|       // NOTE: Only advance the tokeniser if we're at the end of the line and
 | |
|       // there's more to tokenise.
 | |
|       char const *end = tokeniser->string.data + tokeniser->string.size;
 | |
|       if (tokeniser->it != end && tokeniser->end_of_line) {
 | |
|         tokeniser->end_of_line = false;
 | |
|         result                 = true;
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| static DN_Str8 DN_CSV_TokeniserNextField(DN_CSVTokeniser *tokeniser)
 | |
| {
 | |
|   DN_Str8 result = {};
 | |
|   if (!DN_CSV_TokeniserValid(tokeniser))
 | |
|     return result;
 | |
| 
 | |
|   if (!DN_Str8HasData(tokeniser->string)) {
 | |
|     tokeniser->bad = true;
 | |
|     return result;
 | |
|   }
 | |
| 
 | |
|   // NOTE: First time tokeniser is invoked with a string, set up initial state.
 | |
|   char const *string_end = tokeniser->string.data + tokeniser->string.size;
 | |
|   if (!tokeniser->it) {
 | |
|     tokeniser->it = tokeniser->string.data;
 | |
|     // NOTE: Skip any leading new lines
 | |
|     while (tokeniser->it[0] == '\n' || tokeniser->it[0] == '\r')
 | |
|       if (++tokeniser->it == string_end)
 | |
|         break;
 | |
|   }
 | |
| 
 | |
|   // NOTE: Tokeniser pointing at end, no more valid data to parse.
 | |
|   if (tokeniser->it == string_end)
 | |
|     return result;
 | |
| 
 | |
|   // NOTE: Scan forward until the next control character.
 | |
|   // 1. '"'                   Double quoted field,  extract everything between the quotes.
 | |
|   // 2. tokeniser->delimiter  End of the field,     extract everything leading up to the delimiter.
 | |
|   // 3. '\n'                  Last field in record, extract everything leading up the the new line.
 | |
|   char const *begin = tokeniser->it;
 | |
|   while (tokeniser->it != string_end && (tokeniser->it[0] != '"' &&
 | |
|                                                 tokeniser->it[0] != tokeniser->delimiter &&
 | |
|                                                 tokeniser->it[0] != '\n'))
 | |
|     tokeniser->it++;
 | |
| 
 | |
|   bool quoted_field = (tokeniser->it != string_end) && tokeniser->it[0] == '"';
 | |
|   if (quoted_field) {
 | |
|     begin = ++tokeniser->it; // Begin after the quote
 | |
| 
 | |
|   // NOTE: Scan forward until the next '"' which marks the end
 | |
|   // of the field unless it is escaped by another '"'.
 | |
|   find_next_quote:
 | |
|     while (tokeniser->it != string_end && tokeniser->it[0] != '"')
 | |
|       tokeniser->it++;
 | |
| 
 | |
|     // NOTE: If we encounter a '"' right after, the quotes were escaped
 | |
|     // and we need to skip to the next instance of a '"'.
 | |
|     if (tokeniser->it != string_end && tokeniser->it + 1 != string_end && tokeniser->it[1] == '"') {
 | |
|       tokeniser->it += 2;
 | |
|       goto find_next_quote;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // NOTE: Mark the end of the field
 | |
|   char const *end        = tokeniser->it;
 | |
|   tokeniser->end_of_line = tokeniser->it == string_end || end[0] == '\n';
 | |
| 
 | |
|   // NOTE: In files with \r\n style new lines ensure that we don't include
 | |
|   // the \r byte in the CSV field we produce.
 | |
|   if (end != string_end && end[0] == '\n') {
 | |
|     DN_Assert((uintptr_t)(end - 1) > (uintptr_t)tokeniser->string.data &&
 | |
|                "Internal error: The string iterator is pointing behind the start of the string we're reading");
 | |
|     if (end[-1] == '\r')
 | |
|       end = end - 1;
 | |
|   }
 | |
| 
 | |
|   // NOTE: Quoted fields may have whitespace after the closing quote, we skip
 | |
|   // until we reach the field terminator.
 | |
|   if (quoted_field)
 | |
|     while (tokeniser->it != string_end && (tokeniser->it[0] != tokeniser->delimiter &&
 | |
|                                                   tokeniser->it[0] != '\n'))
 | |
|       tokeniser->it++;
 | |
| 
 | |
|   // NOTE: Advance the tokeniser past the field terminator.
 | |
|   if (tokeniser->it != string_end)
 | |
|     tokeniser->it++;
 | |
| 
 | |
|   // NOTE: Generate the record
 | |
|   result.data = DN_Cast(char *) begin;
 | |
|   result.size = DN_Cast(int)(end - begin);
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| static DN_Str8 DN_CSV_TokeniserNextColumn(DN_CSVTokeniser *tokeniser)
 | |
| {
 | |
|   DN_Str8 result = {};
 | |
|   if (!DN_CSV_TokeniserValid(tokeniser))
 | |
|     return result;
 | |
| 
 | |
|   // NOTE: End of line, the user must explicitly advance to the next row
 | |
|   if (tokeniser->end_of_line)
 | |
|     return result;
 | |
| 
 | |
|   // NOTE: Advance tokeniser to the next field in the row
 | |
|   result = DN_CSV_TokeniserNextField(tokeniser);
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| static void DN_CSV_TokeniserSkipLine(DN_CSVTokeniser *tokeniser)
 | |
| {
 | |
|   while (DN_CSV_TokeniserValid(tokeniser) && !tokeniser->end_of_line)
 | |
|     DN_CSV_TokeniserNextColumn(tokeniser);
 | |
|   DN_CSV_TokeniserNextRow(tokeniser);
 | |
| }
 | |
| 
 | |
| static int DN_CSV_TokeniserNextN(DN_CSVTokeniser *tokeniser, DN_Str8 *fields, int fields_size, bool column_iterator)
 | |
| {
 | |
|   if (!DN_CSV_TokeniserValid(tokeniser) || !fields || fields_size <= 0)
 | |
|     return 0;
 | |
| 
 | |
|   int result = 0;
 | |
|   for (; result < fields_size; result++) {
 | |
|     fields[result] = column_iterator ? DN_CSV_TokeniserNextColumn(tokeniser) : DN_CSV_TokeniserNextField(tokeniser);
 | |
|     if (!DN_CSV_TokeniserValid(tokeniser) || !DN_Str8HasData(fields[result]))
 | |
|       break;
 | |
|   }
 | |
| 
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| DN_MSVC_WARNING_PUSH
 | |
| DN_MSVC_WARNING_DISABLE(4505) // 'x': unreferenced function with internal linkage has been removed
 | |
| static int DN_CSV_TokeniserNextColumnN(DN_CSVTokeniser *tokeniser, DN_Str8 *fields, int fields_size)
 | |
| {
 | |
|   int result = DN_CSV_TokeniserNextN(tokeniser, fields, fields_size, true /*column_iterator*/);
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| static int DN_CSV_TokeniserNextFieldN(DN_CSVTokeniser *tokeniser, DN_Str8 *fields, int fields_size)
 | |
| {
 | |
|   int result = DN_CSV_TokeniserNextN(tokeniser, fields, fields_size, false /*column_iterator*/);
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| static void DN_CSV_TokeniserSkipLineN(DN_CSVTokeniser *tokeniser, int count)
 | |
| {
 | |
|   for (int i = 0; i < count && DN_CSV_TokeniserValid(tokeniser); i++)
 | |
|     DN_CSV_TokeniserSkipLine(tokeniser);
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackU64(DN_CSVPack *pack, DN_CSVSerialise serialise, DN_U64 *value)
 | |
| {
 | |
|   if (serialise == DN_CSVSerialise_Read) {
 | |
|     DN_Str8            csv_value = DN_CSV_TokeniserNextColumn(&pack->read_tokeniser);
 | |
|     DN_Str8ToU64Result to_u64    = DN_Str8ToU64(csv_value, 0);
 | |
|     DN_Assert(to_u64.success);
 | |
|     *value = to_u64.value;
 | |
|   } else {
 | |
|     DN_Str8BuilderAppendF(&pack->write_builder, "%s%" PRIu64, pack->write_column++ ? "," : "", *value);
 | |
|   }
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackI64(DN_CSVPack *pack, DN_CSVSerialise serialise, DN_I64 *value)
 | |
| {
 | |
|   if (serialise == DN_CSVSerialise_Read) {
 | |
|     DN_Str8            csv_value = DN_CSV_TokeniserNextColumn(&pack->read_tokeniser);
 | |
|     DN_Str8ToI64Result to_i64    = DN_Str8ToI64(csv_value, 0);
 | |
|     DN_Assert(to_i64.success);
 | |
|     *value = to_i64.value;
 | |
|   } else {
 | |
|     DN_Str8BuilderAppendF(&pack->write_builder, "%s%" PRIu64, pack->write_column++ ? "," : "", *value);
 | |
|   }
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackI32(DN_CSVPack *pack, DN_CSVSerialise serialise, DN_I32 *value)
 | |
| {
 | |
|   DN_I64 u64 = *value;
 | |
|   DN_CSV_PackI64(pack, serialise, &u64);
 | |
|   if (serialise == DN_CSVSerialise_Read)
 | |
|     *value = DN_SaturateCastI64ToI32(u64);
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackI16(DN_CSVPack *pack, DN_CSVSerialise serialise, DN_I16 *value)
 | |
| {
 | |
|   DN_I64 u64 = *value;
 | |
|   DN_CSV_PackI64(pack, serialise, &u64);
 | |
|   if (serialise == DN_CSVSerialise_Read)
 | |
|     *value = DN_SaturateCastI64ToI16(u64);
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackI8(DN_CSVPack *pack, DN_CSVSerialise serialise, DN_I8 *value)
 | |
| {
 | |
|   DN_I64 u64 = *value;
 | |
|   DN_CSV_PackI64(pack, serialise, &u64);
 | |
|   if (serialise == DN_CSVSerialise_Read)
 | |
|     *value = DN_SaturateCastI64ToI8(u64);
 | |
| }
 | |
| 
 | |
| 
 | |
| static void DN_CSV_PackU32(DN_CSVPack *pack, DN_CSVSerialise serialise, DN_U32 *value)
 | |
| {
 | |
|   DN_U64 u64 = *value;
 | |
|   DN_CSV_PackU64(pack, serialise, &u64);
 | |
|   if (serialise == DN_CSVSerialise_Read)
 | |
|     *value = DN_SaturateCastU64ToU32(u64);
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackU16(DN_CSVPack *pack, DN_CSVSerialise serialise, DN_U16 *value)
 | |
| {
 | |
|   DN_U64 u64 = *value;
 | |
|   DN_CSV_PackU64(pack, serialise, &u64);
 | |
|   if (serialise == DN_CSVSerialise_Read)
 | |
|     *value = DN_SaturateCastU64ToU16(u64);
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackBoolAsU64(DN_CSVPack *pack, DN_CSVSerialise serialise, bool *value)
 | |
| {
 | |
|   DN_U64 u64 = *value;
 | |
|   DN_CSV_PackU64(pack, serialise, &u64);
 | |
|   if (serialise == DN_CSVSerialise_Read)
 | |
|     *value = u64 ? 1 : 0;
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackStr8(DN_CSVPack *pack, DN_CSVSerialise serialise, DN_Str8 *str8, DN_Arena *arena)
 | |
| {
 | |
|   if (serialise == DN_CSVSerialise_Read) {
 | |
|     DN_Str8 csv_value = DN_CSV_TokeniserNextColumn(&pack->read_tokeniser);
 | |
|     *str8             = DN_Str8FromStr8(arena, csv_value);
 | |
|   } else {
 | |
|     DN_Str8BuilderAppendF(&pack->write_builder, "%s%.*s", pack->write_column++ ? "," : "", DN_Str8PrintFmt(*str8));
 | |
|   }
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackBuffer(DN_CSVPack *pack, DN_CSVSerialise serialise, void *dest, size_t *size)
 | |
| {
 | |
|   if (serialise == DN_CSVSerialise_Read) {
 | |
|     DN_Str8 csv_value = DN_CSV_TokeniserNextColumn(&pack->read_tokeniser);
 | |
|     *size             = DN_Min(*size, csv_value.size);
 | |
|     DN_Memcpy(dest, csv_value.data, *size);
 | |
|   } else {
 | |
|     DN_Str8BuilderAppendF(&pack->write_builder, "%s%.*s", pack->write_column++ ? "," : "", DN_Cast(int)(*size), dest);
 | |
|   }
 | |
| }
 | |
| 
 | |
| static void DN_CSV_PackBufferWithMax(DN_CSVPack *pack, DN_CSVSerialise serialise, void *dest, size_t *size, size_t max)
 | |
| {
 | |
|   if (serialise == DN_CSVSerialise_Read)
 | |
|     *size = max;
 | |
|   DN_CSV_PackBuffer(pack, serialise, dest, size);
 | |
| }
 | |
| 
 | |
| static bool DN_CSV_PackNewLine(DN_CSVPack *pack, DN_CSVSerialise serialise)
 | |
| {
 | |
|   bool result = true;
 | |
|   if (serialise == DN_CSVSerialise_Read) {
 | |
|     result = DN_CSV_TokeniserNextRow(&pack->read_tokeniser);
 | |
|   } else {
 | |
|     pack->write_column = 0;
 | |
|     result             = DN_Str8BuilderAppendRef(&pack->write_builder, DN_Str8Lit("\n"));
 | |
|   }
 | |
|   return result;
 | |
| }
 | |
| DN_MSVC_WARNING_POP
 |