Update to latest DN library

This commit is contained in:
2026-06-18 16:41:12 +10:00
parent cbf7416220
commit 2659f0316f
17 changed files with 602 additions and 140 deletions
+76 -2
View File
@@ -207,6 +207,14 @@ struct DN_OSThreadLane
void* shared_mem;
};
struct DN_OSThreadLaneway
{
DN_OSThread* threads;
DN_USize threads_count;
DN_UPtr* shared_mem;
DN_OSBarrier barrier;
};
struct DN_OSThread
{
DN_Str8x64 name;
@@ -218,6 +226,7 @@ struct DN_OSThread
void *user_context;
DN_OSThreadFunc *func;
DN_OSSemaphore init_semaphore;
DN_TCInitArgs tc_init_args;
};
// NOTE: DN_OSHttp
@@ -422,14 +431,79 @@ DN_API bool DN_OS_ConditionVariableWaitUntil (D
DN_API void DN_OS_ConditionVariableSignal (DN_OSConditionVariable *cv);
DN_API void DN_OS_ConditionVariableBroadcast (DN_OSConditionVariable *cv);
DN_API bool DN_OS_ThreadInit (DN_OSThread *thread, DN_OSThreadFunc *func, DN_OSThreadLane *lane, void *user_context);
DN_API bool DN_OS_ThreadInit (DN_OSThread *thread, DN_OSThreadFunc *func, DN_OSThreadLane *lane, DN_TCInitArgs tc_init_args, void *user_context);
DN_API bool DN_OS_ThreadJoin (DN_OSThread *thread, DN_TCDeinitArenas deinit_arenas);
DN_API DN_U32 DN_OS_ThreadID ();
DN_API void DN_OS_ThreadSetNameFmt (char const *fmt, ...);
// NOTE: Thread lanes provide an abstraction to represent the concept of programming a CPU like a
// GPU, e.g. SIMT (Single Instruction Multiple Threads). The lane terminology is popularised by Ryan
// Fleury. SIMT is formally defined as
//
// Threads are grouped into warps/wavefronts (typically 32 or 64 threads) that execute the same
// instruction in lockstep, but each thread operates on different data and maintains its own state
//
// The individual threads in a wavefront on the CPU side are colloquially dubbed "lanes" and a
// thread lane here contains the necessary state to facilitate this such as the current index in the
// wavefront and synchronisation primitives to coordinate the different lanes together.
//
// The idea is to write code in a single-threaded manner (linear execution) but across multiple
// threads so that the default is all execution paths are inherently multi-threaded by default. Opt
// out of parallelism instead of opt in. This optimises for the trend of core counts increasing
// whilst clock counts remain static.
//
// A laneway is a helper function to initialise the number of requested OS threads/lanes upfront and
// setup the required synchronisation primitives. It can then be dispatched all the threads which
// start executing the `entry_point` in parallel.
//
// API
// DN_OS_ThreadLaneSync
// A blocking call to synchronise the program-counter of all other lanes in the laneway to this
// function call invocation (using an OS barrier). Optionally pass in the pointer to a pointer
// `ptr_to_share` to broadcast the pointer from one lanes to the others. The lane that wishes
// to broadcast the pointer must have a non-null pointer, all other lanes must pass in a
// non-null pointer. A typical use case might look like:
/*
DN_OSThreadLane *lane = DN_OS_TCThreadLane(); // Get lane from current (t)hread (c)context
// NOTE: Allocate buffer in lane 0
DN_U8 *buffer = nullptr;
if (lane->index == 0)
buffer = DN_ArenaNewArray(DN_TCMainArena(), DN_U8, DN_Gigabytes(1), DN_ZMem_No);
// NOTE: Lane 0 broadcasts the `buffer` pointer to lane 1..N
DN_OS_ThreadLaneSync(lane, &buffer);
// NOTE: We use LaneRange to divide the buffer into equal sized chunks that each lane can
// write into without clobbering over each other.
DN_V2USize range = DN_OS_ThreadLaneRange(lane, DN_Gigabytes(1));
for (DN_USize index = range.begin; index < range.end; index++) { buffer[index] = index; }
*/
// In this example, lane 0 will allocate a 1GiB buffer pass in a `buffer` to
// DN_OS_ThreadLaneSync` that is non-null. Lanes 1->N will skip the branch (because their lanes
// indexes are 1..N) and invoke `DN_OS_ThreadLaneSync` with a nullptr `buffer`. After the
// blocking call is complete, lanes 0->N will now have synchronised the `buffer` pointer and all
// lanes point to the 1GiB range allocated in lane 0's allocator.
//
// Additionally we demonstrate `DN_OS_ThreadLaneRange` which does math behind the scenes to
// divide the buffer up and assign each lane their own indices in the buffer that they can work
// on in parallel without clobbering each others work.
//
// DN_OS_ThreadLaneRange
// Calculates the range of values the current lane in the laneway should execute. For example if
// you have 128 items and 16 threads each lane will receive the following `DN_V2USize` range:
// Lane 0 => [0, 8)
// Lane 1 => [8, 16)
// ...
// Lane 16 => [120, 128)
DN_API DN_OSThreadLane DN_OS_ThreadLaneInit (DN_USize index, DN_USize thread_count, DN_OSBarrier barrier, DN_UPtr *share_mem);
DN_API void DN_OS_ThreadLaneSync (DN_OSThreadLane *lane, void **ptr_to_share);
DN_API DN_V2USize DN_OS_ThreadLaneRange (DN_OSThreadLane *lane, DN_USize values_count);
DN_API DN_V2USize DN_OS_ThreadLaneRange (DN_OSThreadLane const *lane, DN_USize values_count);
DN_API DN_OSThreadLaneway DN_OS_ThreadLanewayFromArgs (DN_OSThread* threads, DN_USize threads_count, DN_UPtr* shared_mem);
DN_API DN_OSThreadLaneway DN_OS_ThreadLanewayFromArena (DN_USize threads_count, DN_Arena* arena);
DN_API void DN_OS_ThreadLanewayDispatch (DN_OSThreadLaneway *laneway, DN_OSThreadFunc *entry_point, DN_TCInitArgs tc_init_args, void *user_context);
DN_API void DN_OS_ThreadLanewayJoin (DN_OSThreadLaneway *laneway, DN_TCDeinitArenas deinit_arenas);
DN_API DN_OSThreadLane* DN_OS_TCThreadLane ();
DN_API void DN_OS_TCThreadLaneSync (void **ptr_to_share);