Start separating inline SIMD to own functions

2017-06-01 18:52:50 +10:00 · 2017-06-01 18:52:50 +10:00 · 9b4072f5bc
commit 9b4072f5bc
parent 6bcdb6d1fb
2 changed files with 401 additions and 211 deletions
--- a/src/DTRendererDebug.h
+++ b/src/DTRendererDebug.h
@ -44,12 +44,15 @@ enum DTRDebugCounter
 enum DTRDebugCycleCount
 {
 	DTRDebugCycleCount_RenderTexturedTriangle_Rasterise,
+	DTRDebugCycleCount_RenderTexturedTriangle_SampleTextureFunction,
+	DTRDebugCycleCount_RenderTexturedTriangle_SampleTexture,
 	DTRDebugCycleCount_RenderTriangle_Rasterise,
 	DTRDebugCycleCount_Count,
 };

 typedef struct DTRDebugCycles
 {
+	char *name;
 	u64 totalCycles;
 	u64 numInvokes;

--- a/src/DTRendererRender.cpp
+++ b/src/DTRendererRender.cpp
@ -11,6 +11,18 @@

 FILE_SCOPE const f32 COLOR_EPSILON = 0.9f;

+inline void Make3PointsClockwise(DqnV3 *p1, DqnV3 *p2, DqnV3 *p3)
+{
+	f32 area2Times = ((p2->x - p1->x) * (p2->y + p1->y)) +
+	                 ((p3->x - p2->x) * (p3->y + p2->y)) +
+	                 ((p1->x - p3->x) * (p1->y + p3->y));
+	if (area2Times > 0)
+	{
+		// Clockwise swap any point to make it clockwise
+		DQN_SWAP(DqnV3, *p2, *p3);
+	}
+}
+
 FILE_SCOPE inline DqnV4 PreMultiplyAlpha1(const DqnV4 color)
 {
 	DQN_ASSERT(color.a >= 0.0f && color.a <= 1.0f);
@ -61,20 +73,9 @@ inline f32 DTRRender_SRGB1ToLinearSpacef(f32 val)
 inline DqnV4 DTRRender_SRGB1ToLinearSpaceV4(DqnV4 color)
 {
 	DqnV4 result;
-	if (globalDTRPlatformFlags.canUseSSE2)
-	{
-		__m128 simdColor  = _mm_set_ps(color.r, color.g, color.b, 0);
-		__m128 simdResult = _mm_mul_ps(simdColor, simdColor);
-		result.r          = ((f32 *)&simdResult)[3];
-		result.g          = ((f32 *)&simdResult)[2];
-		result.b          = ((f32 *)&simdResult)[1];
-	}
-	else
-	{
-		result.r = DTRRender_SRGB1ToLinearSpacef(color.r);
-		result.g = DTRRender_SRGB1ToLinearSpacef(color.g);
-		result.b = DTRRender_SRGB1ToLinearSpacef(color.b);
-	}
+	result.r = DTRRender_SRGB1ToLinearSpacef(color.r);
+	result.g = DTRRender_SRGB1ToLinearSpacef(color.g);
+	result.b = DTRRender_SRGB1ToLinearSpacef(color.b);
 	result.a = color.a;

 	return result;
@ -91,20 +92,9 @@ inline f32 DTRRender_LinearToSRGB1Spacef(f32 val)
 inline DqnV4 DTRRender_LinearToSRGB1SpaceV4(DqnV4 color)
 {
 	DqnV4 result;
-	if (globalDTRPlatformFlags.canUseSSE2)
-	{
-		__m128 simdColor  = _mm_set_ps(color.r, color.g, color.b, 0);
-		__m128 simdResult = _mm_sqrt_ps(simdColor);
-		result.r          = ((f32 *)&simdResult)[3];
-		result.g          = ((f32 *)&simdResult)[2];
-		result.b          = ((f32 *)&simdResult)[1];
-	}
-	else
-	{
-		result.r = DTRRender_LinearToSRGB1Spacef(color.r);
-		result.g = DTRRender_LinearToSRGB1Spacef(color.g);
-		result.b = DTRRender_LinearToSRGB1Spacef(color.b);
-	}
+	result.r = DTRRender_LinearToSRGB1Spacef(color.r);
+	result.g = DTRRender_LinearToSRGB1Spacef(color.g);
+	result.b = DTRRender_LinearToSRGB1Spacef(color.b);
 	result.a = color.a;

 	return result;
@ -139,7 +129,6 @@ FILE_SCOPE inline void SetPixel(DTRRenderBuffer *const renderBuffer, const i32 x
 	if (needGammaFix) color = DTRRender_SRGB1ToLinearSpaceV4(color);

 	u32 src = bitmapPtr[x + (y * pitchInU32)];
-#if 0
 	f32 srcR = (f32)((src >> 16) & 0xFF) * DTRRENDER_INV_255;
 	f32 srcG = (f32)((src >> 8) & 0xFF)  * DTRRENDER_INV_255;
 	f32 srcB = (f32)((src >> 0) & 0xFF)  * DTRRENDER_INV_255;
@ -181,44 +170,6 @@ FILE_SCOPE inline void SetPixel(DTRRenderBuffer *const renderBuffer, const i32 x
 		destB = 255;
 	}

-#else
-	__m128 simdSrc = _mm_set_ps(0.0f,
-	                            (f32)((src >> 16) & 0xFF),
-	                            (f32)((src >> 8) & 0xFF),
-	                            (f32)((src >> 0) & 0xFF));
-
-	__m128 inv255_4x = _mm_set_ps1(DTRRENDER_INV_255);
-	simdSrc          = _mm_mul_ps(simdSrc, inv255_4x);
-	simdSrc          = _mm_mul_ps(simdSrc, simdSrc); // to linear
-
-	f32 invANorm       = 1 - color.a;
-	__m128 invANorm_4x = _mm_set_ps1(invANorm);
-	__m128 const255_4x = _mm_set_ps1(255.0f);
-
-	__m128 simdColor = _mm_set_ps(0, color.r, color.g, color.b);
-	__m128 dest      = _mm_add_ps(simdColor, _mm_mul_ps(simdSrc, invANorm_4x)); // to 0->1 range
-	dest             = _mm_sqrt_ps(dest);                                   // to srgb
-	dest             = _mm_mul_ps(dest, const255_4x);                       // to 0->255 range
-
-	DQN_ASSERT(((f32 *)&dest)[2] >= 0);
-	DQN_ASSERT(((f32 *)&dest)[1] >= 0);
-	DQN_ASSERT(((f32 *)&dest)[0] >= 0);
-
-	if (DTR_DEBUG)
-	{
-		DQN_ASSERT((((f32 *)&dest)[2] - 255.0f) < COLOR_EPSILON);
-		DQN_ASSERT((((f32 *)&dest)[1] - 255.0f) < COLOR_EPSILON);
-		DQN_ASSERT((((f32 *)&dest)[0] - 255.0f) < COLOR_EPSILON);
-	}
-
-	dest = _mm_min_ps(dest, const255_4x);
-
-	f32 destR = ((f32 *)&dest)[2];
-	f32 destG = ((f32 *)&dest)[1];
-	f32 destB = ((f32 *)&dest)[0];
-
-#endif
-
 	u32 pixel = // ((u32)(destA) << 24 |
 	             (u32)(destR) << 16 |
 	             (u32)(destG) << 8 |
@ -582,29 +533,24 @@ typedef struct TriangleInclusionTest

 typedef struct SIMDTriangleInclusionTest
 {
-	DqnV2i boundsMin;
-	DqnV2i boundsMax;
-
 	__m128 vertexZValues;
 	__m128 signedAreaPixelDeltaX;
 	__m128 signedAreaPixelDeltaY;
 	__m128 invSignedAreaParallelogram_4x;
-
 	__m128 startPixel;
+
+	DqnV2i boundsMin;
+	DqnV2i boundsMax;
+	DqnV3  p1;
+	DqnV3  p2;
+	DqnV3  p3;
 } SIMDTriangleInclusionTest;

 FILE_SCOPE TriangleInclusionTest CreateTriangleInclusionTest(const i32 clipWidth,
                                                             const i32 clipHeight, DqnV3 p1,
                                                             DqnV3 p2, DqnV3 p3)
 {
-	f32 area2Times = ((p2.x - p1.x) * (p2.y + p1.y)) + ((p3.x - p2.x) * (p3.y + p2.y)) +
-	                 ((p1.x - p3.x) * (p1.y + p3.y));
-	if (area2Times > 0)
-	{
-		// Clockwise swap any point to make it clockwise
-		DQN_SWAP(DqnV3, p2, p3);
-	}
-
+	Make3PointsClockwise(&p1, &p2, &p3);
 	TriangleInclusionTest result = {};

 	result.boundsMin   = DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x), DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y));
@ -757,6 +703,9 @@ FILE_SCOPE SIMDTriangleInclusionTest CreateSimdTriangleInclusionTest(
 	result.boundsMax = inclusionTest.boundsMax;

 	// NOTE: Order is important here!
+	result.p1 = p1;
+	result.p2 = p2;
+	result.p3 = p3;
 	result.vertexZValues         = _mm_set_ps(0, p3.z, p2.z, p1.z);
 	result.signedAreaPixelDeltaX = _mm_set_ps(0,
 	                                          inclusionTest.signedAreaP3DeltaX,
@ -774,7 +723,6 @@ FILE_SCOPE SIMDTriangleInclusionTest CreateSimdTriangleInclusionTest(
 	return result;
 }

-
 inline void RasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, const DqnV3 p1,
                                      const DqnV3 p2, const DqnV3 p3, const DqnV2 uv1,
                                      const DqnV2 uv2, const DqnV2 uv3, DTRBitmap *const texture,
@ -910,14 +858,215 @@ inline void RasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, const
 	}
 }

-inline void SIMDRasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, const DqnV3 p1,
-                                          const DqnV3 p2, const DqnV3 p3, const DqnV2 uv1,
-                                          const DqnV2 uv2, const DqnV2 uv3,
-                                          DTRBitmap *const texture, const DqnV4 color)
+FILE_SCOPE inline f32 Triangle2TimesSignedArea(const DqnV2 a, const DqnV2 b, const DqnV2 c)
 {
+	f32 result = ((b.x - a.x) * (c.y - a.y)) - ((b.y - a.y) * (c.x - a.x));
+	return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// SIMD
+////////////////////////////////////////////////////////////////////////////////
+// color: _mm_set_ps(a, b, g, r) ie. 0=r, 1=g, 2=b, 3=a
+FILE_SCOPE inline void SIMDDebug_ColorInRange(__m128 color, f32 min, f32 max)
+{
+	if (DTR_DEBUG)
+	{
+		f32 r = ((f32 *)&color)[0];
+		f32 g = ((f32 *)&color)[1];
+		f32 b = ((f32 *)&color)[2];
+		f32 a = ((f32 *)&color)[3];
+		DQN_ASSERT(r >= min && r <= max);
+		DQN_ASSERT(g >= min && g <= max);
+		DQN_ASSERT(b >= min && b <= max);
+		DQN_ASSERT(a >= min && a <= max);
+	}
+}
+
+FILE_SCOPE inline __m128 SIMD_SRGB1ToLinearSpace(__m128 color)
+{
+	SIMDDebug_ColorInRange(color, 0.0f, 1.0f);
+
+	f32 preserveAlpha   = ((f32 *)&color)[3];
+	__m128 result       = _mm_mul_ps(color, color);
+	((f32 *)&result)[3] = preserveAlpha;
+
+	return result;
+}
+
+FILE_SCOPE inline __m128 SIMD_SRGB255ToLinearSpace1(__m128 color)
+{
+	LOCAL_PERSIST const __m128 INV255_4X = _mm_set_ps1(DTRRENDER_INV_255);
+	color                                = _mm_mul_ps(color, INV255_4X);
+
+	f32 preserveAlpha   = ((f32 *)&color)[3];
+	__m128 result       = _mm_mul_ps(color, color);
+	((f32 *)&result)[3] = preserveAlpha;
+
+	return result;
+}
+
+FILE_SCOPE inline __m128 SIMD_LinearSpace1ToSRGB1(__m128 color)
+{
+	SIMDDebug_ColorInRange(color, 0.0f, 1.0f);
+
+	f32 preserveAlpha   = ((f32 *)&color)[3];
+	__m128 result       = _mm_sqrt_ps(color);
+	((f32 *)&result)[3] = preserveAlpha;
+
+	return result;
+}
+
+
+// color: _mm_set_ps(a, b, g, r) ie. 0=r, 1=g, 2=b, 3=a
+FILE_SCOPE inline __m128 SIMD_PreMultiplyAlpha1(__m128 color)
+{
+	f32 alpha        = ((f32 *)&color)[3];
+	__m128 simdAlpha = _mm_set_ps(1, alpha, alpha, alpha);
+	__m128 result    = _mm_mul_ps(color, simdAlpha);
+
+	return result;
+}
+
+FILE_SCOPE inline DqnV2 Get2DOriginFromTransformAnchor(const DqnV2 p1, const DqnV2 p2,
+                                                       const DqnV2 p3,
+                                                       const DTRRenderTransform transform)
+{
+	DqnV2 p1p2 = p2 - p1;
+	DqnV2 p1p3 = p3 - p1;
+
+	DqnV2 p1p2Anchored = p1p2 * transform.anchor;
+	DqnV2 p1p3Anchored = p1p3 * transform.anchor;
+	DqnV2 origin       = p1 + p1p2Anchored + p1p3Anchored;
+
+	return origin;
+}
+
+// color: _mm_set_ps(a, b, g, r) ie. 0=r, 1=g, 2=b, 3=a
+FILE_SCOPE inline void SIMD_SetPixel(DTRRenderBuffer *const renderBuffer, const i32 x, const i32 y,
+                                     __m128 color,
+                                     const enum ColorSpace colorSpace = ColorSpace_SRGB)
+{
+	if (!renderBuffer) return;
+	if (x < 0 || x > (renderBuffer->width - 1)) return;
+	if (y < 0 || y > (renderBuffer->height - 1)) return;
+
+	DTR_DEBUG_EP_TIMED_FUNCTION();
+	SIMDDebug_ColorInRange(color, 0.0f, 1.0f);
+
+	u32 *const bitmapPtr = (u32 *)renderBuffer->memory;
+	const u32 pitchInU32 = (renderBuffer->width * renderBuffer->bytesPerPixel) / 4;
+
+	// If some alpha is involved, we need to apply gamma correction, but if the
+	// new pixel is totally opaque or invisible then we're just flat out
+	// overwriting/keeping the state of the pixel so we can save cycles by skipping.
+	f32 alpha = ((f32 *)&color)[3];
+	bool needGammaFix = (alpha > 0.0f || alpha < (1.0f + COLOR_EPSILON)) && (colorSpace == ColorSpace_SRGB);
+	if (needGammaFix) color = SIMD_SRGB1ToLinearSpace(color);
+
+	// Format: u32 == (XX, RR, GG, BB)
+	u32 srcPixel = bitmapPtr[x + (y * pitchInU32)];
+	__m128 src = _mm_set_ps(0,
+	                        (f32)((srcPixel >> 0) & 0xFF),
+	                        (f32)((srcPixel >> 8) & 0xFF),
+	                        (f32)((srcPixel >> 16) & 0xFF));
+	src = SIMD_SRGB255ToLinearSpace1(src);
+
+	f32 invA       = 1 - alpha;
+	__m128 invA_4x = _mm_set_ps1(invA);
+
+	// PreAlphaMulColor + (1 - Alpha) * Src
+	__m128 oneMinusAlphaSrc = _mm_mul_ps(invA_4x, src);
+	__m128 dest             = _mm_add_ps(color, oneMinusAlphaSrc);
+	dest                    = SIMD_LinearSpace1ToSRGB1(dest);
+	dest                    = _mm_mul_ps(dest, _mm_set_ps1(255.0f)); // to 0->255 range
+
+	SIMDDebug_ColorInRange(dest, 0.0f, 255.0f);
+
+	f32 destR = ((f32 *)&dest)[0];
+	f32 destG = ((f32 *)&dest)[1];
+	f32 destB = ((f32 *)&dest)[2];
+
+	u32 pixel = // ((u32)(destA) << 24 |
+	             (u32)(destR) << 16 |
+	             (u32)(destG) << 8 |
+	             (u32)(destB) << 0;
+	bitmapPtr[x + (y * pitchInU32)] = pixel;
+
+	DTRDebug_CounterIncrement(DTRDebugCounter_SetPixels);
+}
+
+// colorModulate: _mm_set_ps(a, b, g, r)     ie. 0=r, 1=g, 2=b, 3=a
+// barycentric:   _mm_set_ps(xx, p3, p2, p1) ie. 0=p1, 1=p2, 2=p3, 3=a
+FILE_SCOPE __m128 SIMD_SampleTextureForTriangle(DTRBitmap *const texture, const DqnV2 uv1,
+                                                const DqnV2 uv2SubUv1, const DqnV2 uv3SubUv1,
+                                                const __m128 barycentric)
+{
+	DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTexturedTriangle_SampleTextureFunction);
+	LOCAL_PERSIST const __m128 INV255_4X = _mm_set_ps1(1.0f / 255.0f);
+
+	const f32 barycentricP2 = ((f32 *)&barycentric)[1];
+	const f32 barycentricP3 = ((f32 *)&barycentric)[2];
+	DqnV2 uv                = uv1 + (uv2SubUv1 * barycentricP2) + (uv3SubUv1 * barycentricP3);
+
+	const f32 EPSILON = 0.1f;
+	DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
+	DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
+	uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
+	uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
+
+	f32 texelXf = uv.x * texture->dim.w;
+	f32 texelYf = uv.y * texture->dim.h;
+	DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
+	DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
+
+	i32 texelX = (i32)texelXf;
+	i32 texelY = (i32)texelYf;
+
+	const u32 texturePitch     = texture->bytesPerPixel * texture->dim.w;
+	const u8 *const texturePtr = texture->memory;
+	u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + (texelY * texturePitch));
+
+	__m128 color = _mm_set_ps((f32)(texel1 >> 24),
+	                          (f32)((texel1 >> 16) & 0xFF),
+	                          (f32)((texel1 >> 8) & 0xFF),
+	                          (f32)((texel1 >> 0) & 0xFF));
+
+	color = SIMD_SRGB255ToLinearSpace1(color);
+	DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTexturedTriangle_SampleTextureFunction);
+	return color;
+}
+
+FILE_SCOPE void SIMD_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
+                                      DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3,
+                                      DTRBitmap *const texture, DqnV4 color,
+                                      const DTRRenderTransform transform)
+{
+	DTR_DEBUG_EP_TIMED_FUNCTION();
 	////////////////////////////////////////////////////////////////////////////
-	// Calculate Bounding Box
+	// Convert color
 	////////////////////////////////////////////////////////////////////////////
+	__m128 simdColor = _mm_set_ps(color.a, color.b, color.g, color.r);
+	simdColor        = SIMD_SRGB1ToLinearSpace(simdColor);
+	simdColor        = SIMD_PreMultiplyAlpha1(simdColor);
+
+	////////////////////////////////////////////////////////////////////////////
+	// Transform vertexes p1, p2, p3 inplace
+	////////////////////////////////////////////////////////////////////////////
+	{
+		Make3PointsClockwise(&p1, &p2, &p3);
+
+		// TODO(doyle): Transform is only in 2d right now
+		DqnV2 origin   = Get2DOriginFromTransformAnchor(p1.xy, p2.xy, p3.xy, transform);
+		DqnV2 pList[3] = {p1.xy - origin, p2.xy - origin, p3.xy - origin};
+		TransformPoints(origin, pList, DQN_ARRAY_COUNT(pList), transform.scale, transform.rotation);
+
+		p1.xy = pList[0];
+		p2.xy = pList[1];
+		p3.xy = pList[2];
+	}
+
+	DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTexturedTriangle_Rasterise);
 	DqnV2i max = DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x),
 	                       DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y));
 	DqnV2i min = DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x),
@ -927,56 +1076,73 @@ inline void SIMDRasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, c
 	max.x = DQN_MIN(max.x, renderBuffer->width - 1);
 	max.y = DQN_MIN(max.y, renderBuffer->height - 1);

-	const u32 zBufferPitch = renderBuffer->width;
-	const DqnV3 a          = p1;
-	const DqnV3 b          = p2;
-	const DqnV3 c          = p3;

-	DqnV2i startP         = min;
-	f32 signedAreaC       = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
-	f32 signedAreaCDeltaX = a.y - b.y;
-	f32 signedAreaCDeltaY = b.x - a.x;
-
-	f32 signedAreaA       = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
-	f32 signedAreaADeltaX = b.y - c.y;
-	f32 signedAreaADeltaY = c.x - b.x;
-
-	f32 signedAreaB       = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
-	f32 signedAreaBDeltaX = c.y - a.y;
-	f32 signedAreaBDeltaY = a.x - c.x;
-
-	f32 signedAreaParallelogram = signedAreaC + signedAreaA + signedAreaB;
-	if (signedAreaParallelogram == 0) return;
-	f32 invSignedAreaParallelogram = 1.0f / signedAreaParallelogram;
-
-	__m128 invSignedAreaParallelogram_4x = _mm_set_ps1(invSignedAreaParallelogram);
-
-	// NOTE: Order is important here!
-	__m128 triangleZ             = _mm_set_ps(0, b.z, a.z, c.z);
-	__m128 signedAreaPixelDeltaX = _mm_set_ps(0, signedAreaBDeltaX, signedAreaADeltaX, signedAreaCDeltaX);
-	__m128 signedAreaPixelDeltaY = _mm_set_ps(0, signedAreaBDeltaY, signedAreaADeltaY, signedAreaCDeltaY);
-	__m128 signedAreaPixel1      = _mm_set_ps(0, signedAreaB, signedAreaA, signedAreaC);
-	__m128 signedAreaPixel2      = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX);
+	////////////////////////////////////////////////////////////////////////////
+	// Setup SIMD data
+	////////////////////////////////////////////////////////////////////////////
+	const u32 NUM_X_PIXELS_TO_SIMD = 2;
+	const u32 NUM_Y_PIXELS_TO_SIMD = 1;

 	const __m128 INV255_4X    = _mm_set_ps1(1.0f / 255.0f);
 	const __m128 ZERO_4X      = _mm_set_ps1(0.0f);
 	const u32 IS_GREATER_MASK = 0xF;

-	const u32 NUM_X_PIXELS_TO_SIMD = 2;
-	const u32 NUM_Y_PIXELS_TO_SIMD = 1;
-	const __m128 STEP_X_4X         = _mm_set_ps1((f32)NUM_X_PIXELS_TO_SIMD);
-	const __m128 STEP_Y_4X         = _mm_set_ps1((f32)NUM_Y_PIXELS_TO_SIMD);
+	// SignedArea: _mm_set_ps(unused, p3, p2, p1) ie 0=p1, 1=p1, 2=p3, 3=unused
+	__m128 signedAreaPixel1;
+	__m128 signedAreaPixel2;

-	// NOTE: Increase step size to the number of pixels rasterised with SIMD
-	signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, STEP_X_4X);
-	signedAreaPixelDeltaY = _mm_mul_ps(signedAreaPixelDeltaY, STEP_Y_4X);
+	__m128 signedAreaPixelDeltaX;
+	__m128 signedAreaPixelDeltaY;
+	__m128 invSignedAreaParallelogram_4x;
+
+	__m128 triangleZ = _mm_set_ps(0, p3.z, p2.z, p1.z);
+	{
+		DqnV2i startP         = min;
+		f32 signedArea1Start  = Triangle2TimesSignedArea(p2.xy, p3.xy, DqnV2_V2i(startP));
+		f32 signedArea1DeltaX = p2.y - p3.y;
+		f32 signedArea1DeltaY = p3.x - p2.x;
+
+		f32 signedArea2Start  = Triangle2TimesSignedArea(p3.xy, p1.xy, DqnV2_V2i(startP));
+		f32 signedArea2DeltaX = p3.y - p1.y;
+		f32 signedArea2DeltaY = p1.x - p3.x;
+
+		f32 signedArea3Start  = Triangle2TimesSignedArea(p1.xy, p2.xy, DqnV2_V2i(startP));
+		f32 signedArea3DeltaX = p1.y - p2.y;
+		f32 signedArea3DeltaY = p2.x - p1.x;
+
+		f32 signedAreaParallelogram = signedArea1Start + signedArea2Start + signedArea3Start;
+		if (signedAreaParallelogram == 0) return;
+
+		f32 invSignedAreaParallelogram = 1.0f / signedAreaParallelogram;
+		invSignedAreaParallelogram_4x  = _mm_set_ps1(invSignedAreaParallelogram);
+
+		// NOTE: Order is important here!
+		signedAreaPixelDeltaX = _mm_set_ps(0, signedArea3DeltaX, signedArea2DeltaX, signedArea1DeltaX);
+		signedAreaPixelDeltaY = _mm_set_ps(0, signedArea3DeltaY, signedArea2DeltaY, signedArea1DeltaY);
+
+		signedAreaPixel1 = _mm_set_ps(0, signedArea3Start, signedArea2Start, signedArea1Start);
+		signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX);
+
+		// NOTE: Increase step size to the number of pixels rasterised with SIMD
+		{
+			const __m128 STEP_X_4X = _mm_set_ps1((f32)NUM_X_PIXELS_TO_SIMD);
+			const __m128 STEP_Y_4X = _mm_set_ps1((f32)NUM_Y_PIXELS_TO_SIMD);
+
+			signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, STEP_X_4X);
+			signedAreaPixelDeltaY = _mm_mul_ps(signedAreaPixelDeltaY, STEP_Y_4X);
+		}
+
+	}

 	const DqnV2 uv2SubUv1      = uv2 - uv1;
 	const DqnV2 uv3SubUv1      = uv3 - uv1;
-	const __m128 colorModulate = _mm_set_ps(color.a, color.b, color.g, color.r);
 	const u32 texturePitch     = texture->bytesPerPixel * texture->dim.w;
 	const u8 *const texturePtr = texture->memory;
+	const u32 zBufferPitch     = renderBuffer->width;

+	////////////////////////////////////////////////////////////////////////////
+	// Scan and Render
+	////////////////////////////////////////////////////////////////////////////
 	for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD)
 	{
 		__m128 signedArea1 = signedAreaPixel1;
@ -984,6 +1150,7 @@ inline void SIMDRasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, c

 		for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD)
 		{
+
 			// Rasterise buffer(X, Y) pixel
 			{
 				__m128 checkArea    = signedArea1;
@ -992,58 +1159,25 @@ inline void SIMDRasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, c
 				i32 posX            = bufferX;
 				i32 posY            = bufferY;

-				if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK && posX < max.x)
+				if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK)
 				{
 					__m128 barycentric  = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x);
 					__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);

 					i32 zBufferIndex = posX + (posY * zBufferPitch);
 					f32 pixelZValue  = ((f32 *)&barycentricZ)[0] +
-					                    ((f32 *)&barycentricZ)[1] +
-					                    ((f32 *)&barycentricZ)[2];
+					                   ((f32 *)&barycentricZ)[1] +
+					                   ((f32 *)&barycentricZ)[2];
 					f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
 					if (pixelZValue > currZValue)
 					{
 						renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
-
-						f32 barycentricB = ((f32 *)&barycentric)[2];
-						f32 barycentricC = ((f32 *)&barycentric)[0];
-						DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
-
-						const f32 EPSILON = 0.1f;
-						DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
-						DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
-						uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
-						uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
-
-						f32 texelXf = uv.x * texture->dim.w;
-						f32 texelYf = uv.y * texture->dim.h;
-						DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
-						DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
-
-						i32 texelX = (i32)texelXf;
-						i32 texelY = (i32)texelYf;
-
-						u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + (texelY * texturePitch));
-
-						__m128 color1 = _mm_set_ps((f32)(texel1 >> 24),
-						                           (f32)((texel1 >> 16) & 0xFF),
-						                           (f32)((texel1 >> 8) & 0xFF),
-						                           (f32)((texel1 >> 0) & 0xFF));
-						color1              = _mm_mul_ps(color1, INV255_4X);
-						f32 preserveAlpha   = ((f32 *)&color1)[3];
-						color1              = _mm_mul_ps(color1, color1); // to linear space
-						((f32 *)&color1)[3] = preserveAlpha;
-						color1              = _mm_mul_ps(color1, colorModulate);
-
-						DqnV4 blend = {};
-						blend.r     = ((f32 *)&color1)[0];
-						blend.g     = ((f32 *)&color1)[1];
-						blend.b     = ((f32 *)&color1)[2];
-						blend.a     = ((f32 *)&color1)[3];
-						SetPixel(renderBuffer, posX, posY, blend, ColorSpace_Linear);
+						__m128 texSampledColor = SIMD_SampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric);
+						__m128 finalColor      = _mm_mul_ps(texSampledColor, simdColor);
+						SIMD_SetPixel(renderBuffer, posX, posY, finalColor, ColorSpace_Linear);
 					}
 				}
+				signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
 			}

 			// Rasterise buffer(X + 1, Y) pixel
@ -1066,60 +1200,73 @@ inline void SIMDRasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, c
 					if (pixelZValue > currZValue)
 					{
 						renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
-
-						f32 barycentricB = ((f32 *)&barycentric)[2];
-						f32 barycentricC = ((f32 *)&barycentric)[0];
-						DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
-
-						const f32 EPSILON = 0.1f;
-						DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
-						DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
-						uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
-						uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
-
-						f32 texelXf = uv.x * texture->dim.w;
-						f32 texelYf = uv.y * texture->dim.h;
-						DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
-						DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
-
-						i32 texelX = (i32)texelXf;
-						i32 texelY = (i32)texelYf;
-
-						u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + (texelY * texturePitch));
-
-						__m128 color1 = _mm_set_ps((f32)(texel1 >> 24),
-						                           (f32)((texel1 >> 16) & 0xFF),
-						                           (f32)((texel1 >> 8) & 0xFF),
-						                           (f32)((texel1 >> 0) & 0xFF));
-						color1              = _mm_mul_ps(color1, INV255_4X);
-						f32 preserveAlpha   = ((f32 *)&color1)[3];
-						color1              = _mm_mul_ps(color1, color1); // to linear space
-						((f32 *)&color1)[3] = preserveAlpha;
-						color1              = _mm_mul_ps(color1, colorModulate);
-
-						DqnV4 blend = {};
-						blend.r     = ((f32 *)&color1)[0];
-						blend.g     = ((f32 *)&color1)[1];
-						blend.b     = ((f32 *)&color1)[2];
-						blend.a     = ((f32 *)&color1)[3];
-						SetPixel(renderBuffer, posX, posY, blend, ColorSpace_Linear);
+						__m128 texSampledColor = SIMD_SampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric);
+						__m128 finalColor      = _mm_mul_ps(texSampledColor, simdColor);
+						SIMD_SetPixel(renderBuffer, posX, posY, finalColor, ColorSpace_Linear);
 					}
 				}
+				signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX);
 			}

-			signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
-			signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX);
 		}

 		signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY);
 		signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY);
 	}
+	DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTexturedTriangle_Rasterise);
+
+	////////////////////////////////////////////////////////////////////////////
+	// Debug
+	////////////////////////////////////////////////////////////////////////////
+	DTRDebug_CounterIncrement(DTRDebugCounter_RenderTriangle);
+	if (DTR_DEBUG_RENDER)
+	{
+		DqnV2 origin = Get2DOriginFromTransformAnchor(p1.xy, p2.xy, p3.xy, transform);
+		// Draw Bounding box
+		if (0)
+		{
+			DTRRender_Line(renderBuffer, DqnV2i_2i(min.x, min.y), DqnV2i_2i(min.x, max.y), color);
+			DTRRender_Line(renderBuffer, DqnV2i_2i(min.x, max.y), DqnV2i_2i(max.x, max.y), color);
+			DTRRender_Line(renderBuffer, DqnV2i_2i(max.x, max.y), DqnV2i_2i(max.x, min.y), color);
+			DTRRender_Line(renderBuffer, DqnV2i_2i(max.x, min.y), DqnV2i_2i(min.x, min.y), color);
+		}
+
+		// Draw Triangle Coordinate Basis
+		if (0)
+		{
+			DqnV2 xAxis = DqnV2_2f(cosf(transform.rotation), sinf(transform.rotation)) * transform.scale.x;
+			DqnV2 yAxis         = DqnV2_2f(-xAxis.y, xAxis.x) * transform.scale.y;
+			DqnV4 coordSysColor = DqnV4_4f(0, 1, 1, 1);
+			i32 axisLen = 50;
+			DTRRender_Line(renderBuffer, DqnV2i_V2(origin), DqnV2i_V2(origin) + DqnV2i_V2(xAxis * axisLen), coordSysColor);
+			DTRRender_Line(renderBuffer, DqnV2i_V2(origin), DqnV2i_V2(origin) + DqnV2i_V2(yAxis * axisLen), coordSysColor);
+		}
+
+		// Draw axis point
+		if (0)
+		{
+			DqnV4 green  = DqnV4_4f(0, 1, 0, 1);
+			DqnV4 blue   = DqnV4_4f(0, 0, 1, 1);
+			DqnV4 purple = DqnV4_4f(1, 0, 1, 1);
+
+			DTRRender_Rectangle(renderBuffer, p1.xy - DqnV2_1f(5), p1.xy + DqnV2_1f(5), green);
+			DTRRender_Rectangle(renderBuffer, p2.xy - DqnV2_1f(5), p2.xy + DqnV2_1f(5), blue);
+			DTRRender_Rectangle(renderBuffer, p3.xy - DqnV2_1f(5), p3.xy + DqnV2_1f(5), purple);
+		}
+	}
 }

+
 void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3,
                                DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture,
                                DqnV4 color, const DTRRenderTransform transform)
 {
+	if (globalDTRPlatformFlags.canUseSSE2)
+	{
+		SIMD_TexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color, transform);
+		return;
+	}
+
 	DTR_DEBUG_EP_TIMED_FUNCTION();
 	////////////////////////////////////////////////////////////////////////////
 	// Transform vertexes
@ -1149,18 +1296,11 @@ void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, D
 		DQN_SWAP(DqnV3, p2, p3);
 	}

-	DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTexturedTriangle_Rasterise);
 	////////////////////////////////////////////////////////////////////////////
 	// Scan and Render
 	////////////////////////////////////////////////////////////////////////////
-	if (globalDTRPlatformFlags.canUseSSE2)
-	{
-		SIMDRasteriseTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color);
-	}
-	else
-	{
-		RasteriseTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color);
-	}
+	DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTexturedTriangle_Rasterise);
+	RasteriseTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color);
 	DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTexturedTriangle_Rasterise);

 	////////////////////////////////////////////////////////////////////////////
@ -1250,12 +1390,14 @@ void DTRRender_Mesh(DTRRenderBuffer *const renderBuffer, DTRMesh *const mesh, co
 		// TODO(doyle): Why do we need rounding here? Maybe it's because
 		// I don't do any interpolation in the triangle routine for jagged
 		// edges.
+#if 1
 		screenVA.x = (f32)(i32)(screenVA.x + 0.5f);
 		screenVA.y = (f32)(i32)(screenVA.y + 0.5f);
 		screenVB.x = (f32)(i32)(screenVB.x + 0.5f);
 		screenVB.y = (f32)(i32)(screenVB.y + 0.5f);
 		screenVC.x = (f32)(i32)(screenVC.x + 0.5f);
 		screenVC.y = (f32)(i32)(screenVC.y + 0.5f);
+#endif

 		i32 textureAIndex = face.texIndex[0];
 		i32 textureBIndex = face.texIndex[1];
@ -1295,7 +1437,7 @@ void DTRRender_Mesh(DTRRenderBuffer *const renderBuffer, DTRMesh *const mesh, co

 FILE_SCOPE inline void SIMDRasteriseTriangle(DTRRenderBuffer *const renderBuffer,
                                             const SIMDTriangleInclusionTest simdTri,
-                                             const i32 posX, const i32 posY, const DqnV4 color,
+                                             const i32 posX, const i32 posY, DqnV4 color,
                                             __m128 *const signedArea)
 {
 	__m128 ZERO_4X         = _mm_set_ps1(0.0f);
@ -1316,7 +1458,52 @@ FILE_SCOPE inline void SIMDRasteriseTriangle(DTRRenderBuffer *const renderBuffer
 		if (pixelZValue > currZValue)
 		{
 			renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
+#if 1
+			// NOTE: Supersampling
+			const i32 NUM_SUB_PIXEL        = 8;
+			const f32 STEP_SIZE            = 1.0f / NUM_SUB_PIXEL;
+			const f32 COVERAGE_GRANULARITY = 1.0f / (f32)DQN_SQUARED(NUM_SUB_PIXEL);
+
+			f32 coverage = 0;
+			const __m128 STEP_SIZE_4X = _mm_set_ps1(STEP_SIZE);
+			const __m128 SUB_PIXEL_DELTA_X = _mm_mul_ps(simdTri.signedAreaPixelDeltaX, STEP_SIZE_4X);
+			const __m128 SUB_PIXEL_DELTA_Y = _mm_mul_ps(simdTri.signedAreaPixelDeltaY, STEP_SIZE_4X);
+
+			DqnV2 sample = DqnV2_2f(posX + (0.5f / NUM_SUB_PIXEL),
+			                        posY + (0.5f / NUM_SUB_PIXEL));
+			f32 resultP1 = Triangle2TimesSignedArea(simdTri.p2.xy, simdTri.p3.xy, sample);
+			f32 resultP2 = Triangle2TimesSignedArea(simdTri.p3.xy, simdTri.p1.xy, sample);
+			f32 resultP3 = Triangle2TimesSignedArea(simdTri.p1.xy, simdTri.p2.xy, sample);
+
+			__m128 startSubPixel = _mm_set_ps(0, resultP3, resultP2, resultP1);
+			__m128 checkPixel    = startSubPixel;
+			for (i32 subY = 0; subY < NUM_SUB_PIXEL; subY++)
+			{
+				checkPixel = startSubPixel;
+				for (i32 subX = 0; subX < NUM_SUB_PIXEL; subX++)
+				{
+					resultP1 = ((f32 *)&checkPixel)[0];
+					resultP2 = ((f32 *)&checkPixel)[1];
+					resultP3 = ((f32 *)&checkPixel)[2];
+					if ((resultP1 >= 0 && resultP2 >= 0 && resultP3 >= 0) ||
+					    (resultP1 < 0 && resultP2 < 0 && resultP3 < 0))
+					{
+						coverage += COVERAGE_GRANULARITY;
+					}
+
+					checkPixel = _mm_add_ps(checkPixel, SUB_PIXEL_DELTA_X);
+				}
+
+				startSubPixel = _mm_add_ps(startSubPixel, SUB_PIXEL_DELTA_Y);
+			}
+			if (coverage > 0)
+			{
+				color *= coverage;
+				SetPixel(renderBuffer, posX, posY, color, ColorSpace_Linear);
+			}
+#else
 			SetPixel(renderBuffer, posX, posY, color, ColorSpace_Linear);
+#endif
 		}
 	}
 	*signedArea = _mm_add_ps(*signedArea, simdTri.signedAreaPixelDeltaX);