Split triangle rasterisation into separate SIMD and non-SIMD paths

2017-05-31 16:40:13 +10:00 · 2017-05-31 16:40:13 +10:00 · 47d606e297
commit 47d606e297
parent bb5fc03bda
7 changed files with 478 additions and 430 deletions
--- a/src/DTRenderer.cpp
+++ b/src/DTRenderer.cpp
@@ -8,6 +8,8 @@
 #include "dqn.h"
 #include <math.h>

+PlatformFlags globalDTRPlatformFlags;
+
 // #include <algorithm>
 void CompAssignment(DTRRenderBuffer *const renderBuffer, PlatformInput *const input,
                    PlatformMemory *const memory)
@@ -946,8 +948,9 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer,
 	////////////////////////////////////////////////////////////////////////////
 	// Initialisation
 	////////////////////////////////////////////////////////////////////////////
-	DTRState *state = (DTRState *)memory->context;
-	if (input->executableReloaded)
+	DTRState *state        = (DTRState *)memory->context;
+	globalDTRPlatformFlags = input->flags;
+	if (globalDTRPlatformFlags.executableReloaded)
 	{
 		DTR_DEBUG_EP_PROFILE_END();
 		DTR_DEBUG_EP_PROFILE_START();
@@ -1046,77 +1049,7 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer,
 		DTRMesh *const mesh = &state->mesh;
 		DqnV3 modelP = DqnV3_3f(renderBuffer.width * 0.5f, renderBuffer.height * 0.5f, 0);

-		for (u32 i = 0; i < mesh->numFaces; i++)
-		{
-			DTRMeshFace face = mesh->faces[i];
-			DQN_ASSERT(face.numVertexIndex == 3);
-			i32 vertAIndex = face.vertexIndex[0];
-			i32 vertBIndex = face.vertexIndex[1];
-			i32 vertCIndex = face.vertexIndex[2];
-
-			DqnV4 vertA = mesh->vertexes[vertAIndex];
-			DqnV4 vertB = mesh->vertexes[vertBIndex];
-			DqnV4 vertC = mesh->vertexes[vertCIndex];
-			// TODO(doyle): Some models have -ve indexes to refer to relative
-			// vertices. We should resolve that to positive indexes at run time.
-			DQN_ASSERT(vertAIndex < (i32)mesh->numVertexes);
-			DQN_ASSERT(vertBIndex < (i32)mesh->numVertexes);
-			DQN_ASSERT(vertCIndex < (i32)mesh->numVertexes);
-
-			DqnV4 vertAB = vertB - vertA;
-			DqnV4 vertAC = vertC - vertA;
-			DqnV3 normal = DqnV3_Cross(vertAC.xyz, vertAB.xyz);
-
-			f32 intensity = DqnV3_Dot(DqnV3_Normalise(normal), LIGHT);
-			if (intensity < 0) continue;
-			DqnV4 modelCol = DqnV4_4f(1, 1, 1, 1);
-			modelCol.rgb *= DQN_ABS(intensity);
-
-			DqnV3 screenVA = (vertA.xyz * MODEL_SCALE) + modelP;
-			DqnV3 screenVB = (vertB.xyz * MODEL_SCALE) + modelP;
-			DqnV3 screenVC = (vertC.xyz * MODEL_SCALE) + modelP;
-
-			// TODO(doyle): Why do we need rounding here? Maybe it's because
-			// I don't do any interpolation in the triangle routine for jagged
-			// edges.
-			screenVA.x = (f32)(i32)(screenVA.x + 0.5f);
-			screenVA.y = (f32)(i32)(screenVA.y + 0.5f);
-			screenVB.x = (f32)(i32)(screenVB.x + 0.5f);
-			screenVB.y = (f32)(i32)(screenVB.y + 0.5f);
-			screenVC.x = (f32)(i32)(screenVC.x + 0.5f);
-			screenVC.y = (f32)(i32)(screenVC.y + 0.5f);
-
-			i32 textureAIndex = face.texIndex[0];
-			i32 textureBIndex = face.texIndex[1];
-			i32 textureCIndex = face.texIndex[2];
-
-			DqnV2 texA = mesh->texUV[textureAIndex].xy;
-			DqnV2 texB = mesh->texUV[textureBIndex].xy;
-			DqnV2 texC = mesh->texUV[textureCIndex].xy;
-			DQN_ASSERT(textureAIndex < (i32)mesh->numTexUV);
-			DQN_ASSERT(textureBIndex < (i32)mesh->numTexUV);
-			DQN_ASSERT(textureCIndex < (i32)mesh->numTexUV);
-
-			bool DEBUG_SIMPLE_MODE = false;
-			if (DTR_DEBUG && DEBUG_SIMPLE_MODE)
-			{
-				DTRRender_Triangle(&renderBuffer, screenVA, screenVB, screenVC, modelCol);
-			}
-			else
-			{
-				DTRRender_TexturedTriangle(input, &renderBuffer, screenVA, screenVB, screenVC, texA,
-				                           texB, texC, &state->mesh.tex, modelCol);
-			}
-
-			bool DEBUG_WIREFRAME = false;
-			if (DTR_DEBUG && DEBUG_WIREFRAME)
-			{
-				DqnV4 wireColor = DqnV4_4f(1.0f, 1.0f, 1.0f, 0.01f);
-				DTRRender_Line(&renderBuffer, DqnV2i_V2(screenVA.xy), DqnV2i_V2(screenVB.xy), wireColor);
-				DTRRender_Line(&renderBuffer, DqnV2i_V2(screenVB.xy), DqnV2i_V2(screenVC.xy), wireColor);
-				DTRRender_Line(&renderBuffer, DqnV2i_V2(screenVC.xy), DqnV2i_V2(screenVA.xy), wireColor);
-			}
-		}
+		DTRRender_Mesh(&renderBuffer, mesh, modelP, MODEL_SCALE, LIGHT);
 	}

 	// Rect drawing
--- a/src/DTRenderer.h
+++ b/src/DTRenderer.h
@@ -14,4 +14,6 @@ typedef struct DTRState
 	DTRBitmap bitmap;
 	DTRMesh   mesh;
 } DTRState;
+
+extern PlatformFlags globalDTRPlatformFlags;
 #endif
--- a/src/DTRendererDebug.cpp
+++ b/src/DTRendererDebug.cpp
@@ -119,7 +119,7 @@ void inline DTRDebug_BeginCycleCount(enum DTRDebugCycleCount tag)
 {
 	if (DTR_DEBUG_PROFILING)
 	{
-		if (globalDebug.input && globalDebug.input->canUseRdtsc)
+		if (globalDTRPlatformFlags.canUseRdtsc)
 		{
 			DTRDebugCycles *const cycles = &globalDebug.cycles[tag];
 			cycles->tmpStartCycles       = __rdtsc();
@@ -132,7 +132,7 @@ void inline DTRDebug_EndCycleCount(enum DTRDebugCycleCount tag)
 {
 	if (DTR_DEBUG_PROFILING)
 	{
-		if (globalDebug.input && globalDebug.input->canUseRdtsc)
+		if (globalDTRPlatformFlags.canUseRdtsc)
 		{
 			DTRDebugCycles *const cycles = &globalDebug.cycles[tag];
 			cycles->totalCycles += __rdtsc() - cycles->tmpStartCycles;
@@ -208,8 +208,8 @@ void DTRDebug_Update(DTRState *const state,
 		DTRDebug_PushText("MouseRBtn: %s", (input->mouse.rightBtn.endedDown) ? "true" : "false");
 		DTRDebug_PushText("");

-		DTRDebug_PushText("SSE2Support: %s", (input->canUseSSE2) ? "true" : "false");
-		DTRDebug_PushText("RDTSCSupport: %s", (input->canUseRdtsc) ? "true" : "false");
+		DTRDebug_PushText("SSE2Support: %s", (globalDTRPlatformFlags.canUseSSE2) ? "true" : "false");
+		DTRDebug_PushText("RDTSCSupport: %s", (globalDTRPlatformFlags.canUseRdtsc) ? "true" : "false");
 		DTRDebug_PushText("");

 		DTRDebug_PushText("TotalSetPixels: %'lld",    debug->totalSetPixels);
@@ -226,7 +226,7 @@ void DTRDebug_Update(DTRState *const state,
 			u64 avgCycles   = cycles->totalCycles / invocations;
 			DTRDebug_PushText("%d: %'lld avg cycles", i, avgCycles);

-			*cycles = emptyDebugCycles;
+			// *cycles = emptyDebugCycles;
 		}
 		DTRDebug_PushText("");

--- a/src/DTRendererPlatform.h
+++ b/src/DTRendererPlatform.h
@@ -83,13 +83,18 @@ typedef struct PlatformMouse
 	KeyState rightBtn;
 } PlatformMouse;

+typedef struct PlatformFlags
+{
+	bool executableReloaded;
+	bool canUseRdtsc;
+	bool canUseSSE2;
+} PlatformFlags;
+
 typedef struct PlatformInput
 {
-	f32  deltaForFrame;
-	f64  timeNowInS;
-	bool executableReloaded;
-	bool canUseSSE2;
-	bool canUseRdtsc;
+	f32           deltaForFrame;
+	f64           timeNowInS;
+	PlatformFlags flags;

 	PlatformAPI   api;
 	PlatformMouse mouse;
--- a/src/DTRendererRender.cpp
+++ b/src/DTRendererRender.cpp
@@ -554,8 +554,351 @@ FILE_SCOPE void DebugBarycentricInternal(DqnV2 p, DqnV2 a, DqnV2 b, DqnV2 c, f32
 	*u        = 1.0f - *v - *w;
 }

-void DTRRender_TexturedTriangle(PlatformInput *const input,
-                                DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3,
+inline void RasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, const DqnV3 p1,
+                                      const DqnV3 p2, const DqnV3 p3, const DqnV2 uv1,
+                                      const DqnV2 uv2, const DqnV2 uv3, DTRBitmap *const texture,
+                                      const DqnV4 color)
+{
+	DqnV2i max = DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x), DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y));
+	DqnV2i min = DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x), DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y));
+	min.x      = DQN_MAX(min.x, 0);
+	min.y      = DQN_MAX(min.y, 0);
+	max.x      = DQN_MIN(max.x, renderBuffer->width - 1);
+	max.y      = DQN_MIN(max.y, renderBuffer->height - 1);
+
+	const u32 zBufferPitch = renderBuffer->width;
+	const DqnV3 a          = p1;
+	const DqnV3 b          = p2;
+	const DqnV3 c          = p3;
+
+	DqnV2i startP         = min;
+	f32 signedArea1       = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
+	f32 signedArea1DeltaX = a.y - b.y;
+	f32 signedArea1DeltaY = b.x - a.x;
+
+	f32 signedArea2       = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
+	f32 signedArea2DeltaX = b.y - c.y;
+	f32 signedArea2DeltaY = c.x - b.x;
+
+	f32 signedArea3       = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
+	f32 signedArea3DeltaX = c.y - a.y;
+	f32 signedArea3DeltaY = a.x - c.x;
+
+	f32 signedAreaParallelogram = signedArea1 + signedArea2 + signedArea3;
+	if (signedAreaParallelogram == 0) return;
+	f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;
+
+	for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
+	{
+		f32 signedArea1Row = signedArea1;
+		f32 signedArea2Row = signedArea2;
+		f32 signedArea3Row = signedArea3;
+
+		for (i32 bufferX = min.x; bufferX < max.x; bufferX++)
+		{
+			if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0)
+			{
+				f32 barycentricB = signedArea3Row * invSignedAreaParallelogram;
+				f32 barycentricC = signedArea1Row * invSignedAreaParallelogram;
+
+				if (DTR_DEBUG)
+				{
+					const f32 EPSILON = 0.1f;
+
+					f32 debugSignedArea1 = ((b.x - a.x) * (bufferY - a.y)) - ((b.y - a.y) * (bufferX - a.x));
+					f32 debugSignedArea2 = ((c.x - b.x) * (bufferY - b.y)) - ((c.y - b.y) * (bufferX - b.x));
+					f32 debugSignedArea3 = ((a.x - c.x) * (bufferY - c.y)) - ((a.y - c.y) * (bufferX - c.x));
+
+					f32 deltaSignedArea1 = DQN_ABS(debugSignedArea1 - signedArea1Row);
+					f32 deltaSignedArea2 = DQN_ABS(debugSignedArea2 - signedArea2Row);
+					f32 deltaSignedArea3 = DQN_ABS(debugSignedArea3 - signedArea3Row);
+					DQN_ASSERT(deltaSignedArea1 < EPSILON && deltaSignedArea2 < EPSILON &&
+					           deltaSignedArea3 < EPSILON)
+
+					f32 debugBarycentricA, debugBarycentricB, debugBarycentricC;
+					DebugBarycentricInternal(DqnV2_2i(bufferX, bufferY), a.xy, b.xy, c.xy,
+					                         &debugBarycentricA, &debugBarycentricB,
+					                         &debugBarycentricC);
+
+					f32 deltaBaryB = DQN_ABS(barycentricB - debugBarycentricB);
+					f32 deltaBaryC = DQN_ABS(barycentricC - debugBarycentricC);
+
+					DQN_ASSERT(deltaBaryB < EPSILON && deltaBaryC < EPSILON)
+				}
+
+				i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
+				f32 pixelZValue = a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z));
+				f32 currZValue  = renderBuffer->zBuffer[zBufferIndex];
+				DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height));
+
+				if (pixelZValue > currZValue)
+				{
+					renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
+					if (texture)
+					{
+						u8 *texturePtr         = texture->memory;
+						const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
+
+						DqnV2 uv =
+						    uv1 + ((uv2 - uv1) * barycentricB) + ((uv3 - uv1) * barycentricC);
+
+						const f32 EPSILON = 0.1f;
+						DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
+						DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
+
+						uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
+						uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
+
+						f32 texelXf = uv.x * texture->dim.w;
+						f32 texelYf = uv.y * texture->dim.h;
+						DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
+						DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
+
+						i32 texelX = (i32)texelXf;
+						i32 texelY = (i32)texelYf;
+
+						u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
+						                      (texelY * texturePitch));
+
+						DqnV4 color1;
+						color1.a = (f32)(texel1 >> 24);
+						color1.b = (f32)((texel1 >> 16) & 0xFF);
+						color1.g = (f32)((texel1 >> 8) & 0xFF);
+						color1.r = (f32)((texel1 >> 0) & 0xFF);
+
+						color1 *= DTRRENDER_INV_255;
+						color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
+						DqnV4 blend = color * color1;
+						SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
+					}
+					else
+					{
+						SetPixel(renderBuffer, bufferX, bufferY, color, ColorSpace_Linear);
+					}
+				}
+			}
+
+			signedArea1Row += signedArea1DeltaX;
+			signedArea2Row += signedArea2DeltaX;
+			signedArea3Row += signedArea3DeltaX;
+		}
+
+		signedArea1 += signedArea1DeltaY;
+		signedArea2 += signedArea2DeltaY;
+		signedArea3 += signedArea3DeltaY;
+	}
+}
+
+inline void SIMDRasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, const DqnV3 p1,
+                                          const DqnV3 p2, const DqnV3 p3, const DqnV2 uv1,
+                                          const DqnV2 uv2, const DqnV2 uv3,
+                                          DTRBitmap *const texture, const DqnV4 color)
+{
+	////////////////////////////////////////////////////////////////////////////
+	// Calculate Bounding Box
+	////////////////////////////////////////////////////////////////////////////
+	DqnV2i max = DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x),
+	                       DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y));
+	DqnV2i min = DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x),
+	                       DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y));
+	min.x = DQN_MAX(min.x, 0);
+	min.y = DQN_MAX(min.y, 0);
+	max.x = DQN_MIN(max.x, renderBuffer->width - 1);
+	max.y = DQN_MIN(max.y, renderBuffer->height - 1);
+
+	const u32 zBufferPitch = renderBuffer->width;
+	const DqnV3 a          = p1;
+	const DqnV3 b          = p2;
+	const DqnV3 c          = p3;
+
+	DqnV2i startP         = min;
+	f32 signedAreaC       = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
+	f32 signedAreaCDeltaX = a.y - b.y;
+	f32 signedAreaCDeltaY = b.x - a.x;
+
+	f32 signedAreaA       = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
+	f32 signedAreaADeltaX = b.y - c.y;
+	f32 signedAreaADeltaY = c.x - b.x;
+
+	f32 signedAreaB       = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
+	f32 signedAreaBDeltaX = c.y - a.y;
+	f32 signedAreaBDeltaY = a.x - c.x;
+
+	f32 signedAreaParallelogram = signedAreaC + signedAreaA + signedAreaB;
+	if (signedAreaParallelogram == 0) return;
+	f32 invSignedAreaParallelogram = 1.0f / signedAreaParallelogram;
+
+	__m128 invSignedAreaParallelogram_4x = _mm_set_ps1(invSignedAreaParallelogram);
+
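+	// NOTE: Order is important here! _mm_set_ps takes its arguments high lane first, so lane 0 = C, lane 1 = A, lane 2 = B; the barycentric reads below index lanes 0 and 2, and lane 3 stays 0 so it always passes the >= 0 test.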
+	__m128 triangleZ             = _mm_set_ps(0, b.z, a.z, c.z);
+	__m128 signedAreaPixelDeltaX = _mm_set_ps(0, signedAreaBDeltaX, signedAreaADeltaX, signedAreaCDeltaX);
+	__m128 signedAreaPixelDeltaY = _mm_set_ps(0, signedAreaBDeltaY, signedAreaADeltaY, signedAreaCDeltaY);
+	__m128 signedAreaPixel1      = _mm_set_ps(0, signedAreaB, signedAreaA, signedAreaC);
+	__m128 signedAreaPixel2      = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX);
+
+	const __m128 INV255_4X    = _mm_set_ps1(1.0f / 255.0f);
+	const __m128 ZERO_4X      = _mm_set_ps1(0.0f);
+	const __m128 TWO_4X       = _mm_set_ps1(2.0f);
+	const u32 IS_GREATER_MASK = 0xF;
+
+	// NOTE: The X loop handles 2 pixels per iteration (signedAreaPixel1 and signedAreaPixel2), so the per-iteration X delta is doubled
+	signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, TWO_4X);
+
+	const DqnV2 uv2SubUv1      = uv2 - uv1;
+	const DqnV2 uv3SubUv1      = uv3 - uv1;
+	const __m128 colorModulate = _mm_set_ps(color.a, color.b, color.g, color.r);
+
+	for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
+	{
+		__m128 signedArea1 = signedAreaPixel1;
+		__m128 signedArea2 = signedAreaPixel2;
+
+#define PROCESS_COLOR_NO_SIMD 0
+		for (i32 bufferX = min.x; bufferX < max.x; bufferX += 2)
+		{
+			__m128 isGreater1    = _mm_cmpge_ps(signedArea1, ZERO_4X);
+			i32 isGreaterResult1 = _mm_movemask_ps(isGreater1);
+			if ((isGreaterResult1 & IS_GREATER_MASK) == IS_GREATER_MASK)
+			{
+				__m128 barycentric  = _mm_mul_ps(signedArea1, invSignedAreaParallelogram_4x);
+				__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
+
+				i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
+				f32 pixelZValue  = ((f32 *)&barycentricZ)[0] +
+				                   ((f32 *)&barycentricZ)[1] +
+				                   ((f32 *)&barycentricZ)[2];
+				f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
+				if (pixelZValue > currZValue)
+				{
+					renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
+					u8 *texturePtr                      = texture->memory;
+					const u32 texturePitch              = texture->bytesPerPixel * texture->dim.w;
+
+					f32 barycentricB = ((f32 *)&barycentric)[2];
+					f32 barycentricC = ((f32 *)&barycentric)[0];
+					DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
+
+					const f32 EPSILON = 0.1f;
+					DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
+					DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
+					uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
+					uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
+
+					f32 texelXf = uv.x * texture->dim.w;
+					f32 texelYf = uv.y * texture->dim.h;
+					DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
+					DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
+
+					i32 texelX = (i32)texelXf;
+					i32 texelY = (i32)texelYf;
+
+					u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
+					                      (texelY * texturePitch));
+
+#if PROCESS_COLOR_NO_SIMD
+					DqnV4 color1;
+					color1.a = (f32)(texel1 >> 24);
+					color1.b = (f32)((texel1 >> 16) & 0xFF);
+					color1.g = (f32)((texel1 >> 8) & 0xFF);
+					color1.r = (f32)((texel1 >> 0) & 0xFF);
+					color1 *= DTRRENDER_INV_255;
+					color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
+					DqnV4 blend = color * color1;
+#else
+					__m128 color1 =
+					    _mm_set_ps((f32)(texel1 >> 24), (f32)((texel1 >> 16) & 0xFF),
+					               (f32)((texel1 >> 8) & 0xFF), (f32)((texel1 >> 0) & 0xFF));
+					color1 = _mm_mul_ps(color1, INV255_4X);
+					color1 = _mm_mul_ps(color1, color1); // to linear space
+					color1 = _mm_mul_ps(color1, colorModulate);
+
+					DqnV4 blend = {};
+					blend.r     = ((f32 *)&color1)[0];
+					blend.g     = ((f32 *)&color1)[1];
+					blend.b     = ((f32 *)&color1)[2];
+					blend.a     = ((f32 *)&color1)[3];
+#endif
+					SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
+				}
+			}
+
+			__m128 isGreater2    = _mm_cmpge_ps(signedArea2, ZERO_4X);
+			i32 isGreaterResult2 = _mm_movemask_ps(isGreater2);
+			i32 bufferX1         = bufferX + 1;
+			if ((isGreaterResult2 & IS_GREATER_MASK) == IS_GREATER_MASK && bufferX1 < max.x)
+			{
+				__m128 barycentric  = _mm_mul_ps(signedArea2, invSignedAreaParallelogram_4x);
+				__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
+
+				i32 zBufferIndex = bufferX1 + (bufferY * zBufferPitch);
+				f32 pixelZValue  = ((f32 *)&barycentricZ)[0] + ((f32 *)&barycentricZ)[1] +
+				                  ((f32 *)&barycentricZ)[2];
+				f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
+				if (pixelZValue > currZValue)
+				{
+					renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
+					u8 *texturePtr                      = texture->memory;
+					const u32 texturePitch              = texture->bytesPerPixel * texture->dim.w;
+
+					f32 barycentricB = ((f32 *)&barycentric)[2];
+					f32 barycentricC = ((f32 *)&barycentric)[0];
+					DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
+
+					const f32 EPSILON = 0.1f;
+					DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
+					DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
+					uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
+					uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
+
+					f32 texelXf = uv.x * texture->dim.w;
+					f32 texelYf = uv.y * texture->dim.h;
+					DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
+					DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
+
+					i32 texelX = (i32)texelXf;
+					i32 texelY = (i32)texelYf;
+
+					u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
+					                      (texelY * texturePitch));
+
+#if PROCESS_COLOR_NO_SIMD
+					DqnV4 color1;
+					color1.a = (f32)(texel1 >> 24);
+					color1.b = (f32)((texel1 >> 16) & 0xFF);
+					color1.g = (f32)((texel1 >> 8) & 0xFF);
+					color1.r = (f32)((texel1 >> 0) & 0xFF);
+					color1 *= DTRRENDER_INV_255;
+					color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
+					DqnV4 blend = color * color1;
+#else
+					__m128 color1 =
+					    _mm_set_ps((f32)(texel1 >> 24), (f32)((texel1 >> 16) & 0xFF),
+					               (f32)((texel1 >> 8) & 0xFF), (f32)((texel1 >> 0) & 0xFF));
+					color1 = _mm_mul_ps(color1, INV255_4X);
+					color1 = _mm_mul_ps(color1, color1); // to linear space
+					color1 = _mm_mul_ps(color1, colorModulate);
+
+					DqnV4 blend = {};
+					blend.r     = ((f32 *)&color1)[0];
+					blend.g     = ((f32 *)&color1)[1];
+					blend.b     = ((f32 *)&color1)[2];
+					blend.a     = ((f32 *)&color1)[3];
+#endif
+					SetPixel(renderBuffer, bufferX1, bufferY, blend, ColorSpace_Linear);
+				}
+			}
+
+			signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
+			signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX);
+		}
+
+		signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY);
+		signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY);
+	}
+}
+
+void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3,
                                DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture,
                                DqnV4 color, const DTRRenderTransform transform)
 {
@@ -579,18 +922,6 @@ void DTRRender_TexturedTriangle(PlatformInput *const input,
 	color = DTRRender_SRGB1ToLinearSpaceV4(color);
 	color = PreMultiplyAlpha1(color);

-	////////////////////////////////////////////////////////////////////////////
-	// Calculate Bounding Box
-	////////////////////////////////////////////////////////////////////////////
-	DqnV2i max = DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x),
-	                       DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y));
-	DqnV2i min = DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x),
-	                       DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y));
-	min.x = DQN_MAX(min.x, 0);
-	min.y = DQN_MAX(min.y, 0);
-	max.x = DQN_MIN(max.x, renderBuffer->width - 1);
-	max.y = DQN_MIN(max.y, renderBuffer->height - 1);
-
 	f32 area2Times = ((p2.x - p1.x) * (p2.y + p1.y)) +
 	                 ((p3.x - p2.x) * (p3.y + p2.y)) +
 	                 ((p1.x - p3.x) * (p1.y + p3.y));
@@ -600,329 +931,17 @@ void DTRRender_TexturedTriangle(PlatformInput *const input,
 		DQN_SWAP(DqnV3, p2, p3);
 	}

-	////////////////////////////////////////////////////////////////////////////
-	// Signed Area - See Render_Triangle for explanation
-	////////////////////////////////////////////////////////////////////////////
-	const DqnV3 a = p1;
-	const DqnV3 b = p2;
-	const DqnV3 c = p3;
-
 	DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
 	////////////////////////////////////////////////////////////////////////////
 	// Scan and Render
 	////////////////////////////////////////////////////////////////////////////
-	const u32 zBufferPitch = renderBuffer->width;
-	if (input->canUseSSE2)
+	if (globalDTRPlatformFlags.canUseSSE2)
 	{
-		DqnV2i startP                   = min;
-		f32 edge1SignedAreaPixel1       = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
-		f32 edge1SignedAreaPixel1DeltaX = a.y - b.y;
-		f32 edge1SignedAreaPixel1DeltaY = b.x - a.x;
-
-		f32 edge2SignedAreaPixel1       = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
-		f32 edge2SignedAreaPixel1DeltaX = b.y - c.y;
-		f32 edge2SignedAreaPixel1DeltaY = c.x - b.x;
-
-		f32 edge3SignedAreaPixel1       = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
-		f32 edge3SignedAreaPixel1DeltaX = c.y - a.y;
-		f32 edge3SignedAreaPixel1DeltaY = a.x - c.x;
-
-		f32 signedAreaParallelogramPixel1 = edge1SignedAreaPixel1 + edge2SignedAreaPixel1 + edge3SignedAreaPixel1;
-		if (signedAreaParallelogramPixel1 == 0) return;
-		f32 invSignedAreaParallelogramPixel1 = 1 / signedAreaParallelogramPixel1;
-
-		__m128 inv255_4x                    = _mm_set_ps1(DTRRENDER_INV_255);
-		__m128 zero_4x                      = _mm_set_ps1(0.0f);
-		__m128 two_4x                       = _mm_set_ps1(2.0f);
-		__m128 invSignedAreaParallelogram4x = _mm_set_ps1(invSignedAreaParallelogramPixel1);
-		__m128 triangleZ                    = _mm_set_ps(0, b.z, a.z, c.z);
-
-		__m128 signedAreaPixelDeltaX = _mm_set_ps(0, edge3SignedAreaPixel1DeltaX, edge2SignedAreaPixel1DeltaX, edge1SignedAreaPixel1DeltaX);
-		__m128 signedAreaPixelDeltaY = _mm_set_ps(0, edge3SignedAreaPixel1DeltaY, edge2SignedAreaPixel1DeltaY, edge1SignedAreaPixel1DeltaY);
-
-		__m128 signedAreaPixel1 = _mm_set_ps(0, edge3SignedAreaPixel1, edge2SignedAreaPixel1, edge1SignedAreaPixel1);
-		__m128 signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX);
-
-		// NOTE: Step size of 2 pixels across X
-		signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, two_4x);
-
-		const DqnV2 uv2SubUv1      = uv2 - uv1;
-		const DqnV2 uv3SubUv1      = uv3 - uv1;
-		const __m128 colorModulate = _mm_set_ps(color.a, color.b, color.g, color.r);
-
-		const u32 IS_GREATER_MASK = 0xF;
-
-		for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
-		{
-			__m128 signedArea1 = signedAreaPixel1;
-			__m128 signedArea2 = signedAreaPixel2;
-
-#define PROCESS_COLOR_NO_SIMD 0
-			for (i32 bufferX = min.x; bufferX < max.x; bufferX += 2)
-			{
-				__m128 isGreater1    = _mm_cmpge_ps(signedArea1, zero_4x);
-				i32 isGreaterResult1 = _mm_movemask_ps(isGreater1);
-				if ((isGreaterResult1 & IS_GREATER_MASK) == IS_GREATER_MASK)
-				{
-					__m128 barycentric  = _mm_mul_ps(signedArea1, invSignedAreaParallelogram4x);
-					__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
-
-					i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
-					f32 pixelZValue = ((f32 *)&barycentricZ)[0] +
-					                  ((f32 *)&barycentricZ)[1] +
-					                  ((f32 *)&barycentricZ)[2];
-					f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
-					if (pixelZValue > currZValue)
-					{
-						renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
-						u8 *texturePtr                      = texture->memory;
-						const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
-
-						f32 barycentricB = ((f32 *)&barycentric)[2];
-						f32 barycentricC = ((f32 *)&barycentric)[0];
-						DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
-
-						const f32 EPSILON = 0.1f;
-						DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
-						DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
-						uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
-						uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
-
-						f32 texelXf = uv.x * texture->dim.w;
-						f32 texelYf = uv.y * texture->dim.h;
-						DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
-						DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
-
-						i32 texelX = (i32)texelXf;
-						i32 texelY = (i32)texelYf;
-
-						u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
-						                      (texelY * texturePitch));
-
-#if PROCESS_COLOR_NO_SIMD
-						DqnV4 color1;
-						color1.a = (f32)(texel1 >> 24);
-						color1.b = (f32)((texel1 >> 16) & 0xFF);
-						color1.g = (f32)((texel1 >> 8) & 0xFF);
-						color1.r = (f32)((texel1 >> 0) & 0xFF);
-						color1 *= DTRRENDER_INV_255;
-						color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
-						DqnV4 blend = color * color1;
-#else
-						__m128 color1 = _mm_set_ps((f32)(texel1 >> 24),
-						                           (f32)((texel1 >> 16) & 0xFF),
-						                           (f32)((texel1 >> 8) & 0xFF),
-						                           (f32)((texel1 >> 0) & 0xFF));
-						color1 = _mm_mul_ps(color1, inv255_4x);
-						color1 = _mm_mul_ps(color1, color1); // to linear space
-						color1 = _mm_mul_ps(color1, colorModulate);
-
-						DqnV4 blend = {};
-						blend.r     = ((f32 *)&color1)[0];
-						blend.g     = ((f32 *)&color1)[1];
-						blend.b     = ((f32 *)&color1)[2];
-						blend.a     = ((f32 *)&color1)[3];
-#endif
-						SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
-					}
-
-				}
-
-				__m128 isGreater2    = _mm_cmpge_ps(signedArea2, zero_4x);
-				i32 isGreaterResult2 = _mm_movemask_ps(isGreater2);
-				i32 bufferX1         = bufferX + 1;
-				if ((isGreaterResult2 & IS_GREATER_MASK) == IS_GREATER_MASK && bufferX1 < max.x)
-				{
-					__m128 barycentric  = _mm_mul_ps(signedArea2, invSignedAreaParallelogram4x);
-					__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
-
-					i32 zBufferIndex = bufferX1 + (bufferY * zBufferPitch);
-					f32 pixelZValue  = ((f32 *)&barycentricZ)[0] +
-					                   ((f32 *)&barycentricZ)[1] +
-					                   ((f32 *)&barycentricZ)[2];
-					f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
-					if (pixelZValue > currZValue)
-					{
-						renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
-						u8 *texturePtr                      = texture->memory;
-						const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
-
-						f32 barycentricB = ((f32 *)&barycentric)[2];
-						f32 barycentricC = ((f32 *)&barycentric)[0];
-						DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
-
-						const f32 EPSILON = 0.1f;
-						DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
-						DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
-						uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
-						uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
-
-						f32 texelXf = uv.x * texture->dim.w;
-						f32 texelYf = uv.y * texture->dim.h;
-						DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
-						DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
-
-						i32 texelX = (i32)texelXf;
-						i32 texelY = (i32)texelYf;
-
-						u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
-						                      (texelY * texturePitch));
-
-#if PROCESS_COLOR_NO_SIMD
-						DqnV4 color1;
-						color1.a = (f32)(texel1 >> 24);
-						color1.b = (f32)((texel1 >> 16) & 0xFF);
-						color1.g = (f32)((texel1 >> 8) & 0xFF);
-						color1.r = (f32)((texel1 >> 0) & 0xFF);
-						color1 *= DTRRENDER_INV_255;
-						color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
-						DqnV4 blend = color * color1;
-#else
-						__m128 color1 = _mm_set_ps((f32)(texel1 >> 24),
-						                           (f32)((texel1 >> 16) & 0xFF),
-						                           (f32)((texel1 >> 8) & 0xFF),
-						                           (f32)((texel1 >> 0) & 0xFF));
-						color1 = _mm_mul_ps(color1, inv255_4x);
-						color1 = _mm_mul_ps(color1, color1); // to linear space
-						color1 = _mm_mul_ps(color1, colorModulate);
-
-						DqnV4 blend = {};
-						blend.r     = ((f32 *)&color1)[0];
-						blend.g     = ((f32 *)&color1)[1];
-						blend.b     = ((f32 *)&color1)[2];
-						blend.a     = ((f32 *)&color1)[3];
-#endif
-						SetPixel(renderBuffer, bufferX1, bufferY, blend, ColorSpace_Linear);
-					}
-				}
-
-				signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
-				signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX);
-			}
-
-			signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY);
-			signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY);
-		}
+		SIMDRasteriseTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color);
 	}
 	else
 	{
-		DqnV2i startP         = min;
-		f32 signedArea1       = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
-		f32 signedArea1DeltaX = a.y - b.y;
-		f32 signedArea1DeltaY = b.x - a.x;
-
-		f32 signedArea2       = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
-		f32 signedArea2DeltaX = b.y - c.y;
-		f32 signedArea2DeltaY = c.x - b.x;
-
-		f32 signedArea3       = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
-		f32 signedArea3DeltaX = c.y - a.y;
-		f32 signedArea3DeltaY = a.x - c.x;
-
-		f32 signedAreaParallelogram = signedArea1 + signedArea2 + signedArea3;
-		if (signedAreaParallelogram == 0) return;
-		f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;
-
-		for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
-		{
-			f32 signedArea1Row = signedArea1;
-			f32 signedArea2Row = signedArea2;
-			f32 signedArea3Row = signedArea3;
-
-			for (i32 bufferX = min.x; bufferX < max.x; bufferX++)
-			{
-				if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0)
-				{
-					f32 barycentricB = signedArea3Row * invSignedAreaParallelogram;
-					f32 barycentricC = signedArea1Row * invSignedAreaParallelogram;
-
-					if (DTR_DEBUG)
-					{
-						const f32 EPSILON = 0.1f;
-
-						f32 debugSignedArea1 = ((b.x - a.x) * (bufferY - a.y)) - ((b.y - a.y) * (bufferX - a.x));
-						f32 debugSignedArea2 = ((c.x - b.x) * (bufferY - b.y)) - ((c.y - b.y) * (bufferX - b.x));
-						f32 debugSignedArea3 = ((a.x - c.x) * (bufferY - c.y)) - ((a.y - c.y) * (bufferX - c.x));
-
-						f32 deltaSignedArea1 = DQN_ABS(debugSignedArea1 - signedArea1Row);
-						f32 deltaSignedArea2 = DQN_ABS(debugSignedArea2 - signedArea2Row);
-						f32 deltaSignedArea3 = DQN_ABS(debugSignedArea3 - signedArea3Row);
-						DQN_ASSERT(deltaSignedArea1 < EPSILON && deltaSignedArea2 < EPSILON &&
-						           deltaSignedArea3 < EPSILON)
-
-						f32 debugBarycentricA, debugBarycentricB, debugBarycentricC;
-						DebugBarycentricInternal(DqnV2_2i(bufferX, bufferY), a.xy, b.xy, c.xy,
-						                         &debugBarycentricA, &debugBarycentricB,
-						                         &debugBarycentricC);
-
-						f32 deltaBaryB = DQN_ABS(barycentricB - debugBarycentricB);
-						f32 deltaBaryC = DQN_ABS(barycentricC - debugBarycentricC);
-
-						DQN_ASSERT(deltaBaryB < EPSILON && deltaBaryC < EPSILON)
-					}
-
-					i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
-					f32 pixelZValue =
-					    a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z));
-					f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
-					DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height));
-
-					if (pixelZValue > currZValue)
-					{
-						renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
-						if (texture)
-						{
-							u8 *texturePtr         = texture->memory;
-							const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
-
-							DqnV2 uv =
-							    uv1 + ((uv2 - uv1) * barycentricB) + ((uv3 - uv1) * barycentricC);
-
-							const f32 EPSILON = 0.1f;
-							DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
-							DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
-
-							uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
-							uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
-
-							f32 texelXf = uv.x * texture->dim.w;
-							f32 texelYf = uv.y * texture->dim.h;
-							DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
-							DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
-
-							i32 texelX = (i32)texelXf;
-							i32 texelY = (i32)texelYf;
-
-							u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
-							                      (texelY * texturePitch));
-
-							DqnV4 color1;
-							color1.a = (f32)(texel1 >> 24);
-							color1.b = (f32)((texel1 >> 16) & 0xFF);
-							color1.g = (f32)((texel1 >> 8) & 0xFF);
-							color1.r = (f32)((texel1 >> 0) & 0xFF);
-
-							color1 *= DTRRENDER_INV_255;
-							color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
-							DqnV4 blend = color * color1;
-							SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
-						}
-						else
-						{
-							SetPixel(renderBuffer, bufferX, bufferY, color, ColorSpace_Linear);
-						}
-					}
-				}
-
-				signedArea1Row += signedArea1DeltaX;
-				signedArea2Row += signedArea2DeltaX;
-				signedArea3Row += signedArea3DeltaX;
-			}
-
-			signedArea1 += signedArea1DeltaY;
-			signedArea2 += signedArea2DeltaY;
-			signedArea3 += signedArea3DeltaY;
-		}
+		RasteriseTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color);
 	}
 	DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);

@ -932,6 +951,15 @@ void DTRRender_TexturedTriangle(PlatformInput *const input,
 	DTRDebug_CounterIncrement(DTRDebugCounter_RenderTriangle);
 	if (DTR_DEBUG_RENDER)
 	{
+		DqnV2i max =
+		    DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x), DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y));
+		DqnV2i min =
+		    DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x), DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y));
+		min.x = DQN_MAX(min.x, 0);
+		min.y = DQN_MAX(min.y, 0);
+		max.x = DQN_MIN(max.x, renderBuffer->width - 1);
+		max.y = DQN_MIN(max.y, renderBuffer->height - 1);
+
 		// Draw Bounding box
 		if (0)
 		{
@ -966,6 +994,87 @@ void DTRRender_TexturedTriangle(PlatformInput *const input,
 	}
 }

+// Draws every face of a mesh into the render buffer as a flat-lit triangle.
+//
+// renderBuffer: Target buffer, forwarded as-is to the triangle/line routines.
+// mesh:         Mesh to draw. A null mesh is a no-op. Every face must be a triangle.
+// pos, scale:   Each vertex is placed on screen as (vertex.xyz * scale) + pos.
+// lightVector:  Dotted with each face's normalised normal to get the face intensity.
+//               NOTE(review): presumably expected to be unit length - confirm at callers.
+//
+// Faces with a negative intensity are skipped. The remaining faces are drawn with
+// a white colour whose rgb is scaled by the intensity, using mesh->tex as texture.
+void DTRRender_Mesh(DTRRenderBuffer *const renderBuffer, DTRMesh *const mesh, const DqnV3 pos,
+                    const f32 scale, const DqnV3 lightVector)
+{
+	if (!mesh) return;
+
+	for (u32 i = 0; i < mesh->numFaces; i++)
+	{
+		DTRMeshFace face = mesh->faces[i];
+		DQN_ASSERT(face.numVertexIndex == 3);
+		i32 vertAIndex = face.vertexIndex[0];
+		i32 vertBIndex = face.vertexIndex[1];
+		i32 vertCIndex = face.vertexIndex[2];
+
+		// Validate the indexes BEFORE they are used, so a bad face trips the
+		// assert instead of first reading outside of mesh->vertexes.
+		// TODO(doyle): Some models have -ve indexes to refer to relative
+		// vertices. We should resolve that to positive indexes at run time.
+		DQN_ASSERT(vertAIndex >= 0 && vertAIndex < (i32)mesh->numVertexes);
+		DQN_ASSERT(vertBIndex >= 0 && vertBIndex < (i32)mesh->numVertexes);
+		DQN_ASSERT(vertCIndex >= 0 && vertCIndex < (i32)mesh->numVertexes);
+
+		DqnV4 vertA = mesh->vertexes[vertAIndex];
+		DqnV4 vertB = mesh->vertexes[vertBIndex];
+		DqnV4 vertC = mesh->vertexes[vertCIndex];
+
+		// Face normal from two edges sharing vertA. The operand order (AC x AB)
+		// determines which way the normal points.
+		DqnV4 vertAB = vertB - vertA;
+		DqnV4 vertAC = vertC - vertA;
+		DqnV3 normal = DqnV3_Cross(vertAC.xyz, vertAB.xyz);
+
+		// Flat shading: one intensity per face. Negative intensity faces are
+		// not drawn at all.
+		f32 intensity = DqnV3_Dot(DqnV3_Normalise(normal), lightVector);
+		if (intensity < 0) continue;
+		DqnV4 modelCol = DqnV4_4f(1, 1, 1, 1);
+		modelCol.rgb *= DQN_ABS(intensity);
+
+		DqnV3 screenVA = (vertA.xyz * scale) + pos;
+		DqnV3 screenVB = (vertB.xyz * scale) + pos;
+		DqnV3 screenVC = (vertC.xyz * scale) + pos;
+
+		// Snap x/y to whole pixels by adding 0.5 and truncating. z is left alone.
+		// TODO(doyle): Why do we need rounding here? Maybe it's because
+		// I don't do any interpolation in the triangle routine for jagged
+		// edges.
+		screenVA.x = (f32)(i32)(screenVA.x + 0.5f);
+		screenVA.y = (f32)(i32)(screenVA.y + 0.5f);
+		screenVB.x = (f32)(i32)(screenVB.x + 0.5f);
+		screenVB.y = (f32)(i32)(screenVB.y + 0.5f);
+		screenVC.x = (f32)(i32)(screenVC.x + 0.5f);
+		screenVC.y = (f32)(i32)(screenVC.y + 0.5f);
+
+		i32 textureAIndex = face.texIndex[0];
+		i32 textureBIndex = face.texIndex[1];
+		i32 textureCIndex = face.texIndex[2];
+
+		// Same as for the vertexes: check the range before indexing mesh->texUV.
+		DQN_ASSERT(textureAIndex >= 0 && textureAIndex < (i32)mesh->numTexUV);
+		DQN_ASSERT(textureBIndex >= 0 && textureBIndex < (i32)mesh->numTexUV);
+		DQN_ASSERT(textureCIndex >= 0 && textureCIndex < (i32)mesh->numTexUV);
+
+		DqnV2 texA = mesh->texUV[textureAIndex].xy;
+		DqnV2 texB = mesh->texUV[textureBIndex].xy;
+		DqnV2 texC = mesh->texUV[textureCIndex].xy;
+
+		// Debug toggle: draw the face untextured instead of textured.
+		bool DEBUG_SIMPLE_MODE = false;
+		if (DTR_DEBUG && DEBUG_SIMPLE_MODE)
+		{
+			DTRRender_Triangle(renderBuffer, screenVA, screenVB, screenVC, modelCol);
+		}
+		else
+		{
+			DTRRender_TexturedTriangle(renderBuffer, screenVA, screenVB, screenVC, texA, texB,
+			                           texC, &mesh->tex, modelCol);
+		}
+
+		// Debug toggle: outline the face edges on top of the filled triangle.
+		bool DEBUG_WIREFRAME = false;
+		if (DTR_DEBUG && DEBUG_WIREFRAME)
+		{
+			DqnV4 wireColor = DqnV4_4f(1.0f, 1.0f, 1.0f, 0.01f);
+			DTRRender_Line(renderBuffer, DqnV2i_V2(screenVA.xy), DqnV2i_V2(screenVB.xy),
+			               wireColor);
+			DTRRender_Line(renderBuffer, DqnV2i_V2(screenVB.xy), DqnV2i_V2(screenVC.xy),
+			               wireColor);
+			DTRRender_Line(renderBuffer, DqnV2i_V2(screenVC.xy), DqnV2i_V2(screenVA.xy),
+			               wireColor);
+		}
+	}
+}
+
 void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3,
                        DqnV4 color, const DTRRenderTransform transform)
 {
--- a/src/DTRendererRender.h
+++ b/src/DTRendererRender.h
@ -62,8 +62,9 @@ inline DqnV4 DTRRender_PreMultiplyAlphaSRGB1WithLinearConversion(DqnV4 color);
 void DTRRender_Text            (DTRRenderBuffer *const renderBuffer, const DTRFont font, DqnV2 pos, const char *const text, DqnV4 color = DqnV4_1f(1), i32 len = -1);
 void DTRRender_Line            (DTRRenderBuffer *const renderBuffer, DqnV2i a, DqnV2i b, DqnV4 color);
 void DTRRender_Rectangle       (DTRRenderBuffer *const renderBuffer, DqnV2 min, DqnV2 max, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTransform());
+void DTRRender_Mesh            (DTRRenderBuffer *const renderBuffer, DTRMesh *const mesh, const DqnV3 pos, const f32 scale, const DqnV3 lightVector);
 void DTRRender_Triangle        (DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform());
-void DTRRender_TexturedTriangle(PlatformInput *const input, DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform());
+void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform());
 void DTRRender_Bitmap          (DTRRenderBuffer *const renderBuffer, DTRBitmap *const bitmap, DqnV2 pos, const DTRRenderTransform transform = DTRRender_DefaultTransform(), DqnV4 color = DqnV4_4f(1, 1, 1, 1));
 void DTRRender_Clear           (DTRRenderBuffer *const renderBuffer, DqnV3 color);

--- a/src/Win32DTRenderer.cpp
+++ b/src/Win32DTRenderer.cpp
@ -273,23 +273,21 @@ FILE_SCOPE void Win32HandleMenuMessages(HWND window, MSG msg,

 		case Win32Menu_FileFlushMemory:
 		{
+			DqnMemStack memStacks[DQN_ARRAY_COUNT(globalPlatformMemory.stacks)] = {};
 			for (i32 i = 0; i < DQN_ARRAY_COUNT(globalPlatformMemory.stacks); i++)
 			{
-				while (globalPlatformMemory.stacks[i].block->prevBlock)
-					DqnMemStack_FreeLastBlock(&globalPlatformMemory.stacks[i]);
+				DqnMemStack *stack = &globalPlatformMemory.stacks[i];
+				while (stack->block->prevBlock)
+					DqnMemStack_FreeLastBlock(stack);

-				DqnMemStack_ClearCurrBlock(&globalPlatformMemory.stacks[i], true);
+				DqnMemStack_ClearCurrBlock(stack, true);
+				memStacks[i] = *stack;
 			}

-			DqnMemStack mainStack  = globalPlatformMemory.mainStack;
-			DqnMemStack assetStack = globalPlatformMemory.assetStack;
-			DqnMemStack tempStack  = globalPlatformMemory.tempStack;
-
-			PlatformMemory empty            = {};
-			globalPlatformMemory            = empty;
-			globalPlatformMemory.mainStack  = mainStack;
-			globalPlatformMemory.assetStack = assetStack;
-			globalPlatformMemory.tempStack  = tempStack;
+			PlatformMemory empty = {};
+			globalPlatformMemory = empty;
+			for (i32 i = 0; i < DQN_ARRAY_COUNT(globalPlatformMemory.stacks); i++)
+				globalPlatformMemory.stacks[i] = memStacks[i];
 		}
 		break;

@ -560,10 +558,10 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance,
 	platformAPI.FileClose   = Platform_FileClose;
 	platformAPI.Print       = Platform_Print;

-	PlatformInput platformInput = {};
-	platformInput.canUseSSE2    = IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE);
-	platformInput.canUseRdtsc   = IsProcessorFeaturePresent(PF_RDTSC_INSTRUCTION_AVAILABLE);
-	platformInput.api           = platformAPI;
+	PlatformInput platformInput     = {};
+	platformInput.api               = platformAPI;
+	platformInput.flags.canUseSSE2  = IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE);
+	platformInput.flags.canUseRdtsc = IsProcessorFeaturePresent(PF_RDTSC_INSTRUCTION_AVAILABLE);

 	////////////////////////////////////////////////////////////////////////////
 	// Update Loop
@ -585,7 +583,7 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance,
 		{
 			Win32UnloadExternalDLL(&dllCode);
 			dllCode = Win32LoadExternalDLL(dllPath, dllTmpPath, lastWriteTime);
-			platformInput.executableReloaded = true;
+			platformInput.flags.executableReloaded = true;
 		}

 		{