Accelerate textured triangle rendering using SIMD

2017-05-30 17:41:05 +10:00 · 2017-05-30 17:41:05 +10:00 · 49270a2826
commit 49270a2826
parent 4d2a7a7c06
7 changed files with 324 additions and 132 deletions
--- a/src/DTRenderer.cpp
+++ b/src/DTRenderer.cpp
@ -1104,8 +1104,8 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer,
 			}
 			else
 			{
-				DTRRender_TexturedTriangle(&renderBuffer, screenVA, screenVB, screenVC, texA, texB,
-				                           texC, &state->mesh.tex, modelCol);
+				DTRRender_TexturedTriangle(input, &renderBuffer, screenVA, screenVB, screenVC, texA,
+				                           texB, texC, &state->mesh.tex, modelCol);
 			}

 			bool DEBUG_WIREFRAME = false;
--- a/src/DTRendererAsset.cpp
+++ b/src/DTRendererAsset.cpp
@ -17,14 +17,23 @@ void DTRAsset_InitGlobalState()
 	stbi_set_flip_vertically_on_load(true);
 }

-FILE_SCOPE void MemcopyInternal(u8 *dest, u8 *src, size_t numBytes)
+FILE_SCOPE void MemcopyInternal(u8 *const dest, u8 *const src, size_t numBytes)
 {
 	if (!dest || !src || numBytes == 0) return;
 	for (size_t i = 0; i < numBytes; i++)
 		dest[i] = src[i];
 }

-FILE_SCOPE void AssetDqnArrayMemAPICallback(DqnMemAPICallbackInfo info, DqnMemAPICallbackResult *result)
+// NOTE: Dynamic array allocations just request space at the first option they
+// can take. Realloc will reallocate in place if there's space. Otherwise
+// it'll create a new block and reallocate there by copying the old data over.
+
+// So this does waste space, but it is a quick way to reroute allocations into
+// a MemStack. Its main intended purpose is one-shot loading of data when you
+// don't know how much space you need in your DArray. After filling out
+// the dynamic array you then compact the data manually, using memcopies into
+// a new block, and discard the old data.
+FILE_SCOPE void DumbDynamicArrayMemAPICallback(DqnMemAPICallbackInfo info, DqnMemAPICallbackResult *result)
 {
 	DQN_ASSERT(info.type != DqnMemAPICallbackType_Invalid);
 	DqnMemStack *stack = static_cast<DqnMemStack *>(info.userContext);
@ -40,27 +49,7 @@ FILE_SCOPE void AssetDqnArrayMemAPICallback(DqnMemAPICallbackInfo info, DqnMemAP

 		case DqnMemAPICallbackType_Free:
 		{
-			DqnMemStackBlock **blockPtr = &stack->block;
-			while (*blockPtr && (*blockPtr)->memory != info.ptrToFree)
-			{
-				// NOTE(doyle): Ensure that the base ptr of each block is always
-				// actually aligned so we don't ever miss finding the block if
-				// the allocator had to realign the pointer from the base
-				// address.
-				if (DTR_DEBUG)
-				{
-					size_t memBaseAddr = (size_t)((*blockPtr)->memory);
-					DQN_ASSERT(DQN_ALIGN_POW_N(memBaseAddr, stack->byteAlign) ==
-					           memBaseAddr);
-				}
-				blockPtr = &((*blockPtr)->prevBlock);
-			}
-
-			DQN_ASSERT(*blockPtr && (*blockPtr)->memory == info.ptrToFree);
-			DqnMemStackBlock *blockToFree = *blockPtr;
-			*blockPtr                     = blockToFree->prevBlock;
-			DqnMem_Free(blockToFree);
-
+			DQN_ASSERT(DQN_INVALID_CODE_PATH);
 		}
 		break;

@ -189,7 +178,7 @@ bool DTRAsset_LoadWavefrontObj(const PlatformAPI api, DqnMemStack *const memStac
 	size_t fileSize                = file.size;

 	DqnMemAPI memAPI   = {};
-	memAPI.callback    = AssetDqnArrayMemAPICallback;
+	memAPI.callback    = DumbDynamicArrayMemAPICallback;
 	memAPI.userContext = memStack;

 	enum WavefVertexType {
@ -199,6 +188,23 @@ bool DTRAsset_LoadWavefrontObj(const PlatformAPI api, DqnMemStack *const memStac
 		WavefVertexType_Normal,
 	};

+	// TODO(doyle): We should profile reading it out to WavefModel format and
+	// then copying it over, versus just reading the file twice. The first pass
+	// counts the number of vertexes etc. for each section we need. Then in the
+	// second pass we can directly allocate the number we need and reparse it.
+	// I have a feeling that, in general, that's a better idea; at least it gets
+	// rid of a lot of stupid copying code and memstack juggling.
+
+	// NOTE(doyle): We pre-process the data into an intermediate format that
+	// more accurately represents the file format. Since there's no metadata
+	// inside Wavefront objects, we don't know how many vertexes/texUV/normals
+	// there are, which makes it hard to allocate "nicely" out of our memory
+	// stack.
+
+	// So we preprocess. Then once we know the final amount, copy over the data
+	// to a new memstack block such that all the data is compacted together in
+	// memory for locality. Then just throw away the intermediate
+	// representation.
 	WavefModel dummy_ = {};
 	WavefModel *obj   = &dummy_;

@ -410,11 +416,17 @@ bool DTRAsset_LoadWavefrontObj(const PlatformAPI api, DqnMemStack *const memStac
 					DQN_ASSERT(obj->groupNameIndex + 1 < DQN_ARRAY_COUNT(obj->groupName));

 					DQN_ASSERT(!obj->groupName[obj->groupNameIndex]);
+					// TODO(doyle): Broken since I don't "copy" it over to our
+					// final DTRMesh. Below I copy over the data so that all the
+					// allocations are compacted together, but I don't copy this
+					// yet, which means the name currently gets trashed.
+#if 0
 					obj->groupName[obj->groupNameIndex++] =
 					    (char *)DqnMemStack_Push(memStack, (nameLen + 1) * sizeof(char));

 					for (i32 i = 0; i < nameLen; i++)
 						obj->groupName[obj->groupNameIndex - 1][i] = namePtr[i];
+#endif

 					while (scan && (*scan == ' ' || *scan == '\n'))
 						scan++;
--- a/src/DTRendererDebug.cpp
+++ b/src/DTRendererDebug.cpp
@ -121,7 +121,9 @@ void inline DTRDebug_BeginCycleCount(enum DTRDebugCycleCount tag)
 	{
 		if (globalDebug.input && globalDebug.input->canUseRdtsc)
 		{
-			globalDebug.cycleCount[tag] = __rdtsc();
+			DTRDebugCycles *const cycles = &globalDebug.cycles[tag];
+			cycles->tmpStartCycles       = __rdtsc();
+			cycles->numInvokes++;
 		}
 	}
 }
@ -132,7 +134,8 @@ void inline DTRDebug_EndCycleCount(enum DTRDebugCycleCount tag)
 	{
 		if (globalDebug.input && globalDebug.input->canUseRdtsc)
 		{
-			globalDebug.cycleCount[tag] = __rdtsc() - globalDebug.cycleCount[tag];
+			DTRDebugCycles *const cycles = &globalDebug.cycles[tag];
+			cycles->totalCycles += __rdtsc() - cycles->tmpStartCycles;
 		}
 	}
 }
@ -214,9 +217,16 @@ void DTRDebug_Update(DTRState *const state,
 		DTRDebug_PushText("TrianglesRendered: %'lld", debug->counter[DTRDebugCounter_RenderTriangle]);
 		DTRDebug_PushText("");

-		for (i32 i = 0; i < DQN_ARRAY_COUNT(debug->cycleCount); i++)
+		DTRDebugCycles emptyDebugCycles = {};
+		for (i32 i = 0; i < DQN_ARRAY_COUNT(debug->cycles); i++)
 		{
-			DTRDebug_PushText("%d: %'lld cycles", i, debug->cycleCount[i]);
+			DTRDebugCycles *const cycles = &globalDebug.cycles[i];
+
+			u64 invocations = (cycles->numInvokes == 0) ? 1 : cycles->numInvokes;
+			u64 avgCycles   = cycles->totalCycles / invocations;
+			DTRDebug_PushText("%d: %'lld avg cycles", i, avgCycles);
+
+			*cycles = emptyDebugCycles;
 		}
 		DTRDebug_PushText("");

--- a/src/DTRendererDebug.h
+++ b/src/DTRendererDebug.h
@ -47,6 +47,14 @@ enum DTRDebugCycleCount
 	DTRDebugCycleCount_Count,
 };

+typedef struct DTRDebugCycles
+{
+	u64 totalCycles; // Cycles accumulated by DTRDebug_EndCycleCount since DTRDebug_Update last reset this entry
+	u64 numInvokes;  // Incremented by each DTRDebug_BeginCycleCount; divisor for the average DTRDebug_Update prints
+
+	u64 tmpStartCycles; // __rdtsc() value captured at Begin; End subtracts it to get the cycles elapsed
+} DTRDebugCycles;
+
 typedef struct DTRDebug
 {
 	struct DTRFont         *font;
@ -57,9 +65,9 @@ typedef struct DTRDebug
 	DqnV2 displayP;
 	i32   displayYOffset;

-	u64 cycleCount[DTRDebugCycleCount_Count];
-	u64 counter   [DTRDebugCounter_Count];
-	u64 totalSetPixels;
+	DTRDebugCycles cycles [DTRDebugCycleCount_Count];
+	u64            counter[DTRDebugCounter_Count];
+	u64            totalSetPixels;
 } DTRDebug;

 extern DTRDebug globalDebug;
--- a/src/DTRendererRender.cpp
+++ b/src/DTRendererRender.cpp
@ -7,6 +7,8 @@
 #include "external/stb_rect_pack.h"
 #include "external/stb_truetype.h"

+#include <intrin.h>
+
 FILE_SCOPE const f32 COLOR_EPSILON = 0.9f;

 FILE_SCOPE inline DqnV4 PreMultiplyAlpha1(const DqnV4 color)
@ -497,7 +499,8 @@ FILE_SCOPE void DebugBarycentricInternal(DqnV2 p, DqnV2 a, DqnV2 b, DqnV2 c, f32
 	*u        = 1.0f - *v - *w;
 }

-void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3,
+void DTRRender_TexturedTriangle(PlatformInput *const input,
+                                DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3,
                                DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture,
                                DqnV4 color, const DTRRenderTransform transform)
 {
@ -549,125 +552,287 @@ void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, D
 	const DqnV3 b = p2;
 	const DqnV3 c = p3;

-	DqnV2i startP = min;
-	f32 oldSignedArea1       = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
-	f32 oldSignedArea2       = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
-	f32 oldSignedArea3       = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
-
-	f32 signedArea1       = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
-	f32 signedArea1DeltaX = a.y - b.y;
-	f32 signedArea1DeltaY = b.x - a.x;
-
-	f32 signedArea2       = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
-	f32 signedArea2DeltaX = b.y - c.y;
-	f32 signedArea2DeltaY = c.x - b.x;
-
-	f32 signedArea3       = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
-	f32 signedArea3DeltaX = c.y - a.y;
-	f32 signedArea3DeltaY = a.x - c.x;
-
-	f32 signedAreaParallelogram = signedArea1 + signedArea2 + signedArea3;
-	if (signedAreaParallelogram == 0) return;
-	f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;
-
 	DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
 	////////////////////////////////////////////////////////////////////////////
 	// Scan and Render
 	////////////////////////////////////////////////////////////////////////////
-	const u32 zBufferPitch        = renderBuffer->width;
-	const f32 BARYCENTRIC_EPSILON = 0.1f;
-
-	u8 *texturePtr         = texture->memory;
-	const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
-	for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
+	const u32 zBufferPitch = renderBuffer->width;
+	if (input->canUseSSE2)
 	{
-		f32 signedArea1Row = signedArea1;
-		f32 signedArea2Row = signedArea2;
-		f32 signedArea3Row = signedArea3;
+		DqnV2i startP                   = min;
+		f32 edge1SignedAreaPixel1       = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
+		f32 edge1SignedAreaPixel1DeltaX = a.y - b.y;
+		f32 edge1SignedAreaPixel1DeltaY = b.x - a.x;

-		for (i32 bufferX = min.x; bufferX < max.x; bufferX++)
+		f32 edge2SignedAreaPixel1       = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
+		f32 edge2SignedAreaPixel1DeltaX = b.y - c.y;
+		f32 edge2SignedAreaPixel1DeltaY = c.x - b.x;
+
+		f32 edge3SignedAreaPixel1       = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
+		f32 edge3SignedAreaPixel1DeltaX = c.y - a.y;
+		f32 edge3SignedAreaPixel1DeltaY = a.x - c.x;
+
+		f32 signedAreaParallelogramPixel1 = edge1SignedAreaPixel1 + edge2SignedAreaPixel1 + edge3SignedAreaPixel1;
+		if (signedAreaParallelogramPixel1 == 0) return;
+		f32 invSignedAreaParallelogramPixel1 = 1 / signedAreaParallelogramPixel1;
+
+		__m128 zero_4x                      = _mm_set_ps1(0.0f);
+		__m128 two_4x                       = _mm_set_ps1(2.0f);
+		__m128 invSignedAreaParallelogram4x = _mm_set_ps1(invSignedAreaParallelogramPixel1);
+		__m128 triangleZ                    = _mm_set_ps(0, b.z, a.z, c.z);
+
+		__m128 signedAreaPixelDeltaX = _mm_set_ps(0, edge3SignedAreaPixel1DeltaX, edge2SignedAreaPixel1DeltaX, edge1SignedAreaPixel1DeltaX);
+		__m128 signedAreaPixelDeltaY = _mm_set_ps(0, edge3SignedAreaPixel1DeltaY, edge2SignedAreaPixel1DeltaY, edge1SignedAreaPixel1DeltaY);
+
+		__m128 signedAreaPixel1 = _mm_set_ps(0, edge3SignedAreaPixel1, edge2SignedAreaPixel1, edge1SignedAreaPixel1);
+		__m128 signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX);
+
+		// NOTE: Step size of 2 pixels across X
+		signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, two_4x);
+
+		const DqnV2 uv2SubUv1 = uv2 - uv1;
+		const DqnV2 uv3SubUv1 = uv3 - uv1;
+
+		const u32 IS_GREATER_MASK = 0xF;
+
+		for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
 		{
-			if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0)
+			__m128 signedArea1 = signedAreaPixel1;
+			__m128 signedArea2 = signedAreaPixel2;
+
+			for (i32 bufferX = min.x; bufferX < max.x; bufferX += 2)
 			{
-				f32 barycentricB = signedArea3Row * invSignedAreaParallelogram;
-				f32 barycentricC = signedArea1Row * invSignedAreaParallelogram;
-
-				if (DTR_DEBUG)
+				__m128 isGreater1    = _mm_cmpge_ps(signedArea1, zero_4x);
+				i32 isGreaterResult1 = _mm_movemask_ps(isGreater1);
+				if ((isGreaterResult1 & IS_GREATER_MASK) == IS_GREATER_MASK)
 				{
-					const f32 EPSILON = 0.1f;
+					__m128 barycentric  = _mm_mul_ps(signedArea1, invSignedAreaParallelogram4x);
+					__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);

-					f32 debugSignedArea1 = ((b.x - a.x) * (bufferY - a.y)) - ((b.y - a.y) * (bufferX - a.x));
-					f32 debugSignedArea2 = ((c.x - b.x) * (bufferY - b.y)) - ((c.y - b.y) * (bufferX - b.x));
-					f32 debugSignedArea3 = ((a.x - c.x) * (bufferY - c.y)) - ((a.y - c.y) * (bufferX - c.x));
+					i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
+					f32 pixelZValue = ((f32 *)&barycentricZ)[0] +
+					                  ((f32 *)&barycentricZ)[1] +
+					                  ((f32 *)&barycentricZ)[2];
+					f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
+					if (pixelZValue > currZValue)
+					{
+						renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
+						u8 *texturePtr                      = texture->memory;
+						const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;

-					f32 deltaSignedArea1 = debugSignedArea1 - signedArea1Row;
-					f32 deltaSignedArea2 = debugSignedArea2 - signedArea2Row;
-					f32 deltaSignedArea3 = debugSignedArea3 - signedArea3Row;
-					DQN_ASSERT(deltaSignedArea1 < EPSILON && deltaSignedArea2 < EPSILON &&
-					           deltaSignedArea3 < EPSILON)
+						f32 barycentricB = ((f32 *)&barycentric)[2];
+						f32 barycentricC = ((f32 *)&barycentric)[0];
+						DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);

-					f32 debugBarycentricA, debugBarycentricB, debugBarycentricC;
-					DebugBarycentricInternal(DqnV2_2i(bufferX, bufferY), a.xy, b.xy, c.xy,
-					                         &debugBarycentricA, &debugBarycentricB,
-					                         &debugBarycentricC);
+						const f32 EPSILON = 0.1f;
+						DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
+						DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
+						uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
+						uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);

+						f32 texelXf = uv.x * texture->dim.w;
+						f32 texelYf = uv.y * texture->dim.h;
+						DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
+						DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);

-					f32 deltaBaryB = DQN_ABS(barycentricB - debugBarycentricB);
-					f32 deltaBaryC = DQN_ABS(barycentricC - debugBarycentricC);
+						i32 texelX = (i32)texelXf;
+						i32 texelY = (i32)texelYf;
+
+						u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
+						                      (texelY * texturePitch));
+
+						DqnV4 color1;
+						color1.a = (f32)(texel1 >> 24);
+						color1.b = (f32)((texel1 >> 16) & 0xFF);
+						color1.g = (f32)((texel1 >> 8) & 0xFF);
+						color1.r = (f32)((texel1 >> 0) & 0xFF);
+						color1 *= DTRRENDER_INV_255;
+						color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
+						DqnV4 blend = color * color1;
+						SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
+					}

-					DQN_ASSERT(deltaBaryB < EPSILON && deltaBaryC < EPSILON)
 				}

-				i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
-				f32 pixelZValue = a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z));
-				f32 currZValue  = renderBuffer->zBuffer[zBufferIndex];
-				DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height));
-
-				if (pixelZValue > currZValue)
+				__m128 isGreater2    = _mm_cmpge_ps(signedArea2, zero_4x);
+				i32 isGreaterResult2 = _mm_movemask_ps(isGreater2);
+				i32 bufferX1         = bufferX + 1;
+				if ((isGreaterResult2 & IS_GREATER_MASK) == IS_GREATER_MASK && bufferX1 < max.x)
 				{
-					renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
-					const bool DEBUG_SAMPLE_TEXTURE = true;
-					DqnV2 uv = uv1 + ((uv2 - uv1) * barycentricB) + ((uv3 - uv1) * barycentricC);
+					__m128 barycentric  = _mm_mul_ps(signedArea2, invSignedAreaParallelogram4x);
+					__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);

-					const f32 EPSILON = 0.1f;
-					DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
-					DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
+					i32 zBufferIndex = bufferX1 + (bufferY * zBufferPitch);
+					f32 pixelZValue  = ((f32 *)&barycentricZ)[0] +
+					                   ((f32 *)&barycentricZ)[1] +
+					                   ((f32 *)&barycentricZ)[2];
+					f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
+					if (pixelZValue > currZValue)
+					{
+						renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
+						u8 *texturePtr                      = texture->memory;
+						const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;

-					uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
-					uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
+						f32 barycentricB = ((f32 *)&barycentric)[2];
+						f32 barycentricC = ((f32 *)&barycentric)[0];
+						DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);

-					f32 texelXf = uv.x * texture->dim.w;
-					f32 texelYf = uv.y * texture->dim.h;
-					DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
-					DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
+						const f32 EPSILON = 0.1f;
+						DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
+						DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
+						uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
+						uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);

-					i32 texelX = (i32)texelXf;
-					i32 texelY = (i32)texelYf;
+						f32 texelXf = uv.x * texture->dim.w;
+						f32 texelYf = uv.y * texture->dim.h;
+						DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
+						DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);

-					u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
-					                      (texelY * texturePitch));
+						i32 texelX = (i32)texelXf;
+						i32 texelY = (i32)texelYf;

-					DqnV4 color1;
-					color1.a = (f32)(texel1 >> 24);
-					color1.b = (f32)((texel1 >> 16) & 0xFF);
-					color1.g = (f32)((texel1 >> 8) & 0xFF);
-					color1.r = (f32)((texel1 >> 0) & 0xFF);
+						u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
+						                      (texelY * texturePitch));

-					color1 *= DTRRENDER_INV_255;
-					color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
-					DqnV4 blend = color * color1;
-					SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
+						DqnV4 color1;
+						color1.a = (f32)(texel1 >> 24);
+						color1.b = (f32)((texel1 >> 16) & 0xFF);
+						color1.g = (f32)((texel1 >> 8) & 0xFF);
+						color1.r = (f32)((texel1 >> 0) & 0xFF);
+						color1 *= DTRRENDER_INV_255;
+						color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
+						DqnV4 blend = color * color1;
+						SetPixel(renderBuffer, bufferX1, bufferY, blend, ColorSpace_Linear);
+					}
 				}
+
+				signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
+				signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX);
 			}

-			signedArea1Row += signedArea1DeltaX;
-			signedArea2Row += signedArea2DeltaX;
-			signedArea3Row += signedArea3DeltaX;
+			signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY);
+			signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY);
 		}
+	}
+	else
+	{
+		DqnV2i startP         = min;
+		f32 signedArea1       = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
+		f32 signedArea1DeltaX = a.y - b.y;
+		f32 signedArea1DeltaY = b.x - a.x;

-		signedArea1 += signedArea1DeltaY;
-		signedArea2 += signedArea2DeltaY;
-		signedArea3 += signedArea3DeltaY;
+		f32 signedArea2       = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
+		f32 signedArea2DeltaX = b.y - c.y;
+		f32 signedArea2DeltaY = c.x - b.x;
+
+		f32 signedArea3       = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
+		f32 signedArea3DeltaX = c.y - a.y;
+		f32 signedArea3DeltaY = a.x - c.x;
+
+		f32 signedAreaParallelogram = signedArea1 + signedArea2 + signedArea3;
+		if (signedAreaParallelogram == 0) return;
+		f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;
+
+		for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
+		{
+			f32 signedArea1Row = signedArea1;
+			f32 signedArea2Row = signedArea2;
+			f32 signedArea3Row = signedArea3;
+
+			for (i32 bufferX = min.x; bufferX < max.x; bufferX++)
+			{
+				if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0)
+				{
+					f32 barycentricB = signedArea3Row * invSignedAreaParallelogram;
+					f32 barycentricC = signedArea1Row * invSignedAreaParallelogram;
+
+					if (DTR_DEBUG)
+					{
+						const f32 EPSILON = 0.1f;
+
+						f32 debugSignedArea1 = ((b.x - a.x) * (bufferY - a.y)) - ((b.y - a.y) * (bufferX - a.x));
+						f32 debugSignedArea2 = ((c.x - b.x) * (bufferY - b.y)) - ((c.y - b.y) * (bufferX - b.x));
+						f32 debugSignedArea3 = ((a.x - c.x) * (bufferY - c.y)) - ((a.y - c.y) * (bufferX - c.x));
+
+						f32 deltaSignedArea1 = DQN_ABS(debugSignedArea1 - signedArea1Row);
+						f32 deltaSignedArea2 = DQN_ABS(debugSignedArea2 - signedArea2Row);
+						f32 deltaSignedArea3 = DQN_ABS(debugSignedArea3 - signedArea3Row);
+						DQN_ASSERT(deltaSignedArea1 < EPSILON && deltaSignedArea2 < EPSILON &&
+						           deltaSignedArea3 < EPSILON)
+
+						f32 debugBarycentricA, debugBarycentricB, debugBarycentricC;
+						DebugBarycentricInternal(DqnV2_2i(bufferX, bufferY), a.xy, b.xy, c.xy,
+						                         &debugBarycentricA, &debugBarycentricB,
+						                         &debugBarycentricC);
+
+						f32 deltaBaryB = DQN_ABS(barycentricB - debugBarycentricB);
+						f32 deltaBaryC = DQN_ABS(barycentricC - debugBarycentricC);
+
+						DQN_ASSERT(deltaBaryB < EPSILON && deltaBaryC < EPSILON)
+					}
+
+					i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
+					f32 pixelZValue =
+					    a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z));
+					f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
+					DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height));
+
+					if (pixelZValue > currZValue)
+					{
+						renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
+						if (texture)
+						{
+							u8 *texturePtr         = texture->memory;
+							const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
+
+							DqnV2 uv =
+							    uv1 + ((uv2 - uv1) * barycentricB) + ((uv3 - uv1) * barycentricC);
+
+							const f32 EPSILON = 0.1f;
+							DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
+							DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
+
+							uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
+							uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
+
+							f32 texelXf = uv.x * texture->dim.w;
+							f32 texelYf = uv.y * texture->dim.h;
+							DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
+							DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
+
+							i32 texelX = (i32)texelXf;
+							i32 texelY = (i32)texelYf;
+
+							u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
+							                      (texelY * texturePitch));
+
+							DqnV4 color1;
+							color1.a = (f32)(texel1 >> 24);
+							color1.b = (f32)((texel1 >> 16) & 0xFF);
+							color1.g = (f32)((texel1 >> 8) & 0xFF);
+							color1.r = (f32)((texel1 >> 0) & 0xFF);
+
+							color1 *= DTRRENDER_INV_255;
+							color1      = DTRRender_SRGB1ToLinearSpaceV4(color1);
+							DqnV4 blend = color * color1;
+							SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
+						}
+						else
+						{
+							SetPixel(renderBuffer, bufferX, bufferY, color, ColorSpace_Linear);
+						}
+					}
+				}
+
+				signedArea1Row += signedArea1DeltaX;
+				signedArea2Row += signedArea2DeltaX;
+				signedArea3Row += signedArea3DeltaX;
+			}
+
+			signedArea1 += signedArea1DeltaY;
+			signedArea2 += signedArea2DeltaY;
+			signedArea3 += signedArea3DeltaY;
+		}
 	}
 	DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);

@ -881,12 +1046,10 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
 	if (signedAreaParallelogram == 0) return;
 	f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;

-	DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
 	////////////////////////////////////////////////////////////////////////////
 	// Scan and Render
 	////////////////////////////////////////////////////////////////////////////
-	const u32 zBufferPitch        = renderBuffer->width;
-	const f32 BARYCENTRIC_EPSILON = 0.1f;
+	const u32 zBufferPitch = renderBuffer->width;
 	for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
 	{
 		f32 signedArea1Row = signedArea1;
@ -903,6 +1066,7 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
 				i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
 				f32 pixelZValue = a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z));
 				f32 currZValue  = renderBuffer->zBuffer[zBufferIndex];
+				DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height));
 				if (pixelZValue > currZValue)
 				{
 					renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
@ -919,7 +1083,6 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
 		signedArea2 += signedArea2DeltaY;
 		signedArea3 += signedArea3DeltaY;
 	}
-	DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);

 	////////////////////////////////////////////////////////////////////////////
 	// Debug
@ -958,8 +1121,7 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
 	}
 }

-void DTRRender_Bitmap(DTRRenderBuffer *const renderBuffer,
-                      DTRBitmap *const bitmap, DqnV2 pos,
+void DTRRender_Bitmap(DTRRenderBuffer *const renderBuffer, DTRBitmap *const bitmap, DqnV2 pos,
                      const DTRRenderTransform transform, DqnV4 color)
 {
 	if (!bitmap || !bitmap->memory || !renderBuffer) return;
--- a/src/DTRendererRender.h
+++ b/src/DTRendererRender.h
@ -63,7 +63,7 @@ void DTRRender_Text            (DTRRenderBuffer *const renderBuffer, const DTRFo
 void DTRRender_Line            (DTRRenderBuffer *const renderBuffer, DqnV2i a, DqnV2i b, DqnV4 color);
 void DTRRender_Rectangle       (DTRRenderBuffer *const renderBuffer, DqnV2 min, DqnV2 max, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTransform());
 void DTRRender_Triangle        (DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform());
-void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform());
+void DTRRender_TexturedTriangle(PlatformInput *const input, DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform());
 void DTRRender_Bitmap          (DTRRenderBuffer *const renderBuffer, DTRBitmap *const bitmap, DqnV2 pos, const DTRRenderTransform transform = DTRRender_DefaultTransform(), DqnV4 color = DqnV4_4f(1, 1, 1, 1));
 void DTRRender_Clear           (DTRRenderBuffer *const renderBuffer, DqnV3 color);

--- a/src/build.bat
+++ b/src/build.bat
@ -39,7 +39,7 @@ REM wd4100 unused argument parameters
 REM wd4201 nonstandard extension used: nameless struct/union
 REM wd4189 local variable is initialised but not referenced
 REM wd4505 unreferenced local function not used will be removed
-set CompileFlags=-EHsc -GR- -Oi -MT -Z7 -W4 -wd4100 -wd4201 -wd4189 -wd4505 -Od -FAsc /I..\src\external\
+set CompileFlags=-EHsc -GR- -Oi -MT -Z7 -W4 -wd4100 -wd4201 -wd4189 -wd4505 -O2 -FAsc /I..\src\external\
 set DLLFlags=/Fm%ProjectName% /Fo%ProjectName% /Fa%ProjectName% /Fe%ProjectName%
 set Win32Flags=/FmWin32DTRenderer /FeWin32DTRenderer

@ -62,7 +62,7 @@ REM ////////////////////////////////////////////////////////////////////////////
 del *.pdb >NUL 2>NUL
 cl %CompileFlags% %Win32Flags% ..\src\Win32DTRenderer.cpp /link %LinkLibraries% %LinkFlags%
 REM cl %CompileFlags% %DLLFlags%   ..\src\UnityBuild\UnityBuild.cpp /LD /link ..\src\external\easy\easy_profiler.lib /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags%
-cl %CompileFlags% %DLLFlags%   ..\src\UnityBuild\UnityBuild.cpp /LD /link /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags%
+cl %CompileFlags% %DLLFlags%  ..\src\UnityBuild\UnityBuild.cpp /LD /link /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags%

 popd
 set LastError=%ERRORLEVEL%