From 4c5f8d43a025a6ffc68840308eca1ddcaedcc3ec Mon Sep 17 00:00:00 2001 From: Doyle Thai Date: Fri, 2 Jun 2017 18:23:15 +1000 Subject: [PATCH] Merge textured/non-textured tri rendering to one --- src/DTRenderer.cpp | 2 +- src/DTRendererDebug.cpp | 3 +- src/DTRendererDebug.h | 65 +++-- src/DTRendererRender.cpp | 569 ++++++++------------------------------- src/Win32DTRenderer.cpp | 2 +- 5 files changed, 156 insertions(+), 485 deletions(-) diff --git a/src/DTRenderer.cpp b/src/DTRenderer.cpp index 56aa7b5..f5b3ea6 100644 --- a/src/DTRenderer.cpp +++ b/src/DTRenderer.cpp @@ -952,7 +952,7 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer, globalDTRPlatformFlags = input->flags; if (globalDTRPlatformFlags.executableReloaded) { - DTR_DEBUG_EP_PROFILE_END(); + // DTR_DEBUG_EP_PROFILE_END(); DTR_DEBUG_EP_PROFILE_START(); } diff --git a/src/DTRendererDebug.cpp b/src/DTRendererDebug.cpp index 45d557e..f161b0f 100644 --- a/src/DTRendererDebug.cpp +++ b/src/DTRendererDebug.cpp @@ -240,8 +240,7 @@ void DTRDebug_Update(DTRState *const state, DTRDebug_PushText("%d:%s: %'lld avg cycles", i, cycles->name, avgCycles); } cycles->name = NULL; - - // *cycles = emptyDebugCycles; + // *cycles = emptyDebugCycles; } DTRDebug_PushText(""); diff --git a/src/DTRendererDebug.h b/src/DTRendererDebug.h index 346a19a..d87951c 100644 --- a/src/DTRendererDebug.h +++ b/src/DTRendererDebug.h @@ -5,33 +5,31 @@ // NOTE: When DTR_DEBUG is 0, _ALL_ debug code is compiled out. #define DTR_DEBUG 1 +#define DTR_DEBUG_RENDER 1 -#if DTR_DEBUG - #define DTR_DEBUG_RENDER 1 +// For inbuilt profiling DTRDebug_BeginCycleCount .. etc +#define DTR_DEBUG_PROFILING 1 - #define DTR_DEBUG_PROFILING_EASY_PROFILER 0 - #if DTR_DEBUG_PROFILING_EASY_PROFILER - #define BUILD_WITH_EASY_PROFILER 1 - #include "external/easy/profiler.h" - - #define DTR_DEBUG_EP_PROFILE_START() profiler::startListen() - #define DTR_DEBUG_EP_PROFILE_END() profiler::stopListen() - - #define DTR_DEBUG_EP_TIMED_BLOCK(name) EASY_BLOCK(name) - #define DTR_DEBUG_EP_TIMED_NONSCOPED_BLOCK(name) EASY_NONSCOPED_BLOCK(name) - #define DTR_DEBUG_EP_TIMED_END_BLOCK() EASY_END_BLOCK() - #define DTR_DEBUG_EP_TIMED_FUNCTION() EASY_FUNCTION() - #else - #define DTR_DEBUG_EP_PROFILE_START() - #define DTR_DEBUG_EP_PROFILE_END() - - #define DTR_DEBUG_EP_TIMED_BLOCK(name) - #define DTR_DEBUG_EP_TIMED_NONSCOPED_BLOCK(name) - #define DTR_DEBUG_EP_TIMED_END_BLOCK() - #define DTR_DEBUG_EP_TIMED_FUNCTION() - #endif - - #define DTR_DEBUG_PROFILING 1 +#define DTR_DEBUG_PROFILING_EASY_PROFILER 0 +#if DTR_DEBUG_PROFILING_EASY_PROFILER + #define BUILD_WITH_EASY_PROFILER 0 + #include "external/easy/profiler.h" + + #define DTR_DEBUG_EP_PROFILE_START() profiler::startListen() + #define DTR_DEBUG_EP_PROFILE_END() profiler::stopListen() + + #define DTR_DEBUG_EP_TIMED_BLOCK(name) EASY_BLOCK(name) + #define DTR_DEBUG_EP_TIMED_NONSCOPED_BLOCK(name) EASY_NONSCOPED_BLOCK(name) + #define DTR_DEBUG_EP_TIMED_END_BLOCK() EASY_END_BLOCK + #define DTR_DEBUG_EP_TIMED_FUNCTION() EASY_FUNCTION() +#else + #define DTR_DEBUG_EP_PROFILE_START() + #define DTR_DEBUG_EP_PROFILE_END() + + #define DTR_DEBUG_EP_TIMED_BLOCK(name) + #define DTR_DEBUG_EP_TIMED_NONSCOPED_BLOCK(name) + #define DTR_DEBUG_EP_TIMED_END_BLOCK() + #define DTR_DEBUG_EP_TIMED_FUNCTION() #endif enum DTRDebugCounter @@ -48,15 +46,32 @@ enum DTRDebugCycleCount DTRDebugCycleCount_DTR_Update_RenderPrimitiveTriangles, DTRDebugCycleCount_SIMDTexturedTriangle, + DTRDebugCycleCount_SIMDTexturedTriangle_Preamble, + DTRDebugCycleCount_SIMDTexturedTriangle_Preamble_SArea, + DTRDebugCycleCount_SIMDTexturedTriangle_Preamble_SIMDStep, DTRDebugCycleCount_SIMDTexturedTriangle_Rasterise, DTRDebugCycleCount_SIMDTexturedTriangle_RasterisePixel, DTRDebugCycleCount_SIMDTexturedTriangle_SampleTexture, DTRDebugCycleCount_SIMDTriangle, + DTRDebugCycleCount_SIMDTriangle_Preamble, + DTRDebugCycleCount_SIMDTriangle_Preamble_SArea, + DTRDebugCycleCount_SIMDTriangle_Preamble_SIMDStep, DTRDebugCycleCount_SIMDTriangle_Rasterise, DTRDebugCycleCount_SIMDTriangle_RasterisePixel, + DTRDebugCycleCount_SlowTexturedTriangle, + DTRDebugCycleCount_SlowTexturedTriangle_Preamble, + DTRDebugCycleCount_SlowTexturedTriangle_Preamble_SArea, + DTRDebugCycleCount_SlowTexturedTriangle_Preamble_SIMDStep, + DTRDebugCycleCount_SlowTexturedTriangle_Rasterise, + DTRDebugCycleCount_SlowTexturedTriangle_RasterisePixel, + DTRDebugCycleCount_SlowTexturedTriangle_SampleTexture, + DTRDebugCycleCount_SlowTriangle, + DTRDebugCycleCount_SlowTriangle_Preamble, + DTRDebugCycleCount_SlowTriangle_Preamble_SArea, + DTRDebugCycleCount_SlowTriangle_Preamble_SIMDStep, DTRDebugCycleCount_SlowTriangle_Rasterise, DTRDebugCycleCount_SlowTriangle_RasterisePixel, DTRDebugCycleCount_Count, diff --git a/src/DTRendererRender.cpp b/src/DTRendererRender.cpp index 913ac78..3e66395 100644 --- a/src/DTRendererRender.cpp +++ b/src/DTRendererRender.cpp @@ -760,8 +760,27 @@ FILE_SCOPE void SIMDTexturedTriangle(DTRRenderBuffer *const renderBuffer, const { DTR_DEBUG_EP_TIMED_FUNCTION(); - DTRDebug_BeginCycleCount("SIMDTexturedTriangle", DTRDebugCycleCount_SIMDTexturedTriangle); +#define DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(type) \ + do \ + { \ + if (texture) \ + DTRDebug_BeginCycleCount("SIMDTextured" #type, DTRDebugCycleCount_SIMDTextured##type); \ + else \ + DTRDebug_BeginCycleCount("SIMD" #type, DTRDebugCycleCount_SIMD##type); \ + } while (0) +#define DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(type) \ + do \ + { \ + if (texture) \ + DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMDTextured##type); \ + else \ + DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMD##type); \ + } while (0) + + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle); + + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble); //////////////////////////////////////////////////////////////////////////// // Convert color //////////////////////////////////////////////////////////////////////////// @@ -789,19 +808,39 @@ FILE_SCOPE void SIMDTexturedTriangle(DTRRenderBuffer *const renderBuffer, const __m128 triangleZ = _mm_set_ps(0, p3.z, p2.z, p1.z); { - DqnV2i startP = min; - f32 signedArea1Start = Triangle2TimesSignedArea(p2.xy, p3.xy, DqnV2_V2i(startP)); + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble_SArea); + DTR_DEBUG_EP_TIMED_BLOCK("SIMDTexturedTriangle_Preamble_SArea"); + DqnV2 startP = DqnV2_V2i(min); +#if 1 + f32 signedArea1Start = Triangle2TimesSignedArea(p2.xy, p3.xy, startP); f32 signedArea1DeltaX = p2.y - p3.y; f32 signedArea1DeltaY = p3.x - p2.x; - f32 signedArea2Start = Triangle2TimesSignedArea(p3.xy, p1.xy, DqnV2_V2i(startP)); + f32 signedArea2Start = Triangle2TimesSignedArea(p3.xy, p1.xy, startP); f32 signedArea2DeltaX = p3.y - p1.y; f32 signedArea2DeltaY = p1.x - p3.x; - f32 signedArea3Start = Triangle2TimesSignedArea(p1.xy, p2.xy, DqnV2_V2i(startP)); + f32 signedArea3Start = Triangle2TimesSignedArea(p1.xy, p2.xy, startP); + f32 signedArea3DeltaX = p1.y - p2.y; + f32 signedArea3DeltaY = p2.x - p1.x; +#else + f32 signedArea1Start = ((p3.x - p2.x) * (startP.y - p2.y)) - ((p3.y - p2.y) * (startP.x - p2.x)); + f32 signedArea1DeltaX = p2.y - p3.y; + f32 signedArea1DeltaY = p3.x - p2.x; + + f32 signedArea2Start = ((p1.x - p3.x) * (startP.y - p3.y)) - ((p1.y - p3.y) * (startP.x - p3.x)); + f32 signedArea2DeltaX = p3.y - p1.y; + f32 signedArea2DeltaY = p1.x - p3.x; + + f32 signedArea3Start = ((p2.x - p1.x) * (startP.y - p1.y)) - ((p2.y - p1.y) * (startP.x - p1.x)); f32 signedArea3DeltaX = p1.y - p2.y; f32 signedArea3DeltaY = p2.x - p1.x; +#endif + DTR_DEBUG_EP_TIMED_END_BLOCK(); + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble_SArea); + + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble_SIMDStep); f32 signedAreaParallelogram = signedArea1Start + signedArea2Start + signedArea3Start; if (signedAreaParallelogram == 0) return; @@ -823,18 +862,20 @@ FILE_SCOPE void SIMDTexturedTriangle(DTRRenderBuffer *const renderBuffer, const signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, STEP_X_4X); signedAreaPixelDeltaY = _mm_mul_ps(signedAreaPixelDeltaY, STEP_Y_4X); } + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble_SIMDStep); } const DqnV2 uv2SubUv1 = uv2 - uv1; const DqnV2 uv3SubUv1 = uv3 - uv1; - const u32 texturePitch = texture->bytesPerPixel * texture->dim.w; - const u8 *const texturePtr = texture->memory; + const u32 texturePitch = (texture) ? (texture->bytesPerPixel * texture->dim.w) : 0; + const u8 *const texturePtr = (texture) ? (texture->memory) : NULL; const u32 zBufferPitch = renderBuffer->width; + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble); //////////////////////////////////////////////////////////////////////////// // Scan and Render //////////////////////////////////////////////////////////////////////////// - DTRDebug_BeginCycleCount("SIMDTexturedTriangle_Rasterise", DTRDebugCycleCount_SIMDTexturedTriangle_Rasterise); + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Rasterise); for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD) { __m128 signedArea1 = signedAreaPixel1; @@ -843,8 +884,6 @@ FILE_SCOPE void SIMDTexturedTriangle(DTRRenderBuffer *const renderBuffer, const for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD) { - DTRDebug_BeginCycleCount("SIMDTexturedTriangle_RasterisePixel", - DTRDebugCycleCount_SIMDTexturedTriangle_RasterisePixel); // Rasterise buffer(X, Y) pixel { __m128 checkArea = signedArea1; @@ -855,6 +894,7 @@ FILE_SCOPE void SIMDTexturedTriangle(DTRRenderBuffer *const renderBuffer, const if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK) { + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_RasterisePixel); __m128 barycentric = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x); __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); @@ -866,14 +906,19 @@ FILE_SCOPE void SIMDTexturedTriangle(DTRRenderBuffer *const renderBuffer, const if (pixelZValue > currZValue) { renderBuffer->zBuffer[zBufferIndex] = pixelZValue; - __m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); - __m128 finalColor = _mm_mul_ps(texSampledColor, simdColor); + + __m128 finalColor = simdColor; + if (texture) + { + __m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); + finalColor = _mm_mul_ps(texSampledColor, simdColor); + } SIMDSetPixel(renderBuffer, posX, posY, finalColor, ColorSpace_Linear); } + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel); } signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); } - DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMDTexturedTriangle_RasterisePixel); // Rasterise buffer(X + 1, Y) pixel { @@ -895,8 +940,13 @@ FILE_SCOPE void SIMDTexturedTriangle(DTRRenderBuffer *const renderBuffer, const if (pixelZValue > currZValue) { renderBuffer->zBuffer[zBufferIndex] = pixelZValue; - __m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); - __m128 finalColor = _mm_mul_ps(texSampledColor, simdColor); + + __m128 finalColor = simdColor; + if (texture) + { + __m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); + finalColor = _mm_mul_ps(texSampledColor, simdColor); + } SIMDSetPixel(renderBuffer, posX, posY, finalColor, ColorSpace_Linear); } } @@ -907,8 +957,8 @@ FILE_SCOPE void SIMDTexturedTriangle(DTRRenderBuffer *const renderBuffer, const signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY); signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY); } - DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMDTexturedTriangle_Rasterise); - DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMDTexturedTriangle); + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Rasterise); + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle); } FILE_SCOPE void SlowTexturedTriangle(DTRRenderBuffer *const renderBuffer, const DqnV3 p1, @@ -917,6 +967,28 @@ FILE_SCOPE void SlowTexturedTriangle(DTRRenderBuffer *const renderBuffer, const DqnV4 color, const DqnV2i min, const DqnV2i max) { DTR_DEBUG_EP_TIMED_FUNCTION(); + +#define DEBUG_SLOW_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(type) \ + do \ + { \ + if (texture) \ + DTRDebug_BeginCycleCount("SlowTextured" #type, DTRDebugCycleCount_SlowTextured##type); \ + else \ + DTRDebug_BeginCycleCount("Slow" #type, DTRDebugCycleCount_Slow##type); \ + } while (0) + +#define DEBUG_SLOW_AUTO_CHOOSE_END_CYCLE_COUNT(type) \ + do \ + { \ + if (texture) \ + DTRDebug_EndCycleCount(DTRDebugCycleCount_SlowTextured##type); \ + else \ + DTRDebug_EndCycleCount(DTRDebugCycleCount_Slow##type); \ + } while (0) + + DEBUG_SLOW_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle); + + DEBUG_SLOW_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble); //////////////////////////////////////////////////////////////////////////// // Convert Color //////////////////////////////////////////////////////////////////////////// @@ -926,6 +998,7 @@ FILE_SCOPE void SlowTexturedTriangle(DTRRenderBuffer *const renderBuffer, const //////////////////////////////////////////////////////////////////////////// // Scan and Render //////////////////////////////////////////////////////////////////////////// + DEBUG_SLOW_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble_SArea); DqnV2i startP = min; f32 signedArea1Pixel = Triangle2TimesSignedArea(p2.xy, p3.xy, DqnV2_V2i(startP)); f32 signedArea1DeltaX = p2.y - p3.y; @@ -938,22 +1011,25 @@ FILE_SCOPE void SlowTexturedTriangle(DTRRenderBuffer *const renderBuffer, const f32 signedArea3Pixel = Triangle2TimesSignedArea(p1.xy, p2.xy, DqnV2_V2i(startP)); f32 signedArea3DeltaX = p1.y - p2.y; f32 signedArea3DeltaY = p2.x - p1.x; + DEBUG_SLOW_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble_SArea); + DEBUG_SLOW_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble_SIMDStep); f32 signedAreaParallelogram = signedArea1Pixel + signedArea2Pixel + signedArea3Pixel; if (signedAreaParallelogram == 0) return; f32 invSignedAreaParallelogram = 1.0f / signedAreaParallelogram; + DEBUG_SLOW_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble_SIMDStep); - const DqnV3 p2SubP1 = p2 - p1; - const DqnV3 p3SubP1 = p3 - p1; - const DqnV2 uv2SubUv1 = uv2 - uv1; - const DqnV2 uv3SubUv1 = uv3 - uv1; - + const DqnV3 p2SubP1 = p2 - p1; + const DqnV3 p3SubP1 = p3 - p1; + const DqnV2 uv2SubUv1 = uv2 - uv1; + const DqnV2 uv3SubUv1 = uv3 - uv1; + const u32 texturePitch = (texture) ? (texture->bytesPerPixel * texture->dim.w) : 0; + const u8 *const texturePtr = (texture) ? (texture->memory) : NULL; const u32 zBufferPitch = renderBuffer->width; - const u8 *const texturePtr = texture->memory; - const u32 texturePitch = texture->bytesPerPixel * texture->dim.w; - - const f32 INV_255 = 1 / 255.0f; + const f32 INV_255 = 1 / 255.0f; + DEBUG_SLOW_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble); + DEBUG_SLOW_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Rasterise); for (i32 bufferY = min.y; bufferY < max.y; bufferY++) { f32 signedArea1 = signedArea1Pixel; @@ -964,32 +1040,9 @@ FILE_SCOPE void SlowTexturedTriangle(DTRRenderBuffer *const renderBuffer, const { if (signedArea1 >= 0 && signedArea2 >= 0 && signedArea3 >= 0) { - f32 barycentricB = signedArea3 * invSignedAreaParallelogram; - f32 barycentricC = signedArea1 * invSignedAreaParallelogram; - - if (DTR_DEBUG) - { - const f32 EPSILON = 0.1f; - f32 debugSignedArea1 = ((p2.x - p1.x) * (bufferY - p1.y)) - ((p2.y - p1.y) * (bufferX - p1.x)); - f32 debugSignedArea2 = ((p3.x - p2.x) * (bufferY - p2.y)) - ((p3.y - p2.y) * (bufferX - p2.x)); - f32 debugSignedArea3 = ((p1.x - p3.x) * (bufferY - p3.y)) - ((p1.y - p3.y) * (bufferX - p3.x)); - - f32 deltaSignedArea1 = DQN_ABS(debugSignedArea1 - signedArea1); - f32 deltaSignedArea2 = DQN_ABS(debugSignedArea2 - signedArea2); - f32 deltaSignedArea3 = DQN_ABS(debugSignedArea3 - signedArea3); - DQN_ASSERT(deltaSignedArea1 < EPSILON && deltaSignedArea2 < EPSILON && - deltaSignedArea3 < EPSILON) - - f32 debugBarycentricA, debugBarycentricB, debugBarycentricC; - DebugBarycentricInternal(DqnV2_2i(bufferX, bufferY), p1.xy, p2.xy, p3.xy, - &debugBarycentricA, &debugBarycentricB, - &debugBarycentricC); - - f32 deltaBaryB = DQN_ABS(barycentricB - debugBarycentricB); - f32 deltaBaryC = DQN_ABS(barycentricC - debugBarycentricC); - - DQN_ASSERT(deltaBaryB < EPSILON && deltaBaryC < EPSILON) - } + DEBUG_SLOW_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_RasterisePixel); + f32 barycentricB = signedArea2 * invSignedAreaParallelogram; + f32 barycentricC = signedArea3 * invSignedAreaParallelogram; i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); f32 pixelZValue = p1.z + (barycentricB * (p2SubP1.z)) + (barycentricC * (p3SubP1.z)); @@ -1017,7 +1070,8 @@ FILE_SCOPE void SlowTexturedTriangle(DTRRenderBuffer *const renderBuffer, const i32 texelX = (i32)texelXf; i32 texelY = (i32)texelYf; - u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + (texelY * texturePitch)); + u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + + (texelY * texturePitch)); DqnV4 color1; color1.a = (f32)(texel1 >> 24); @@ -1034,6 +1088,7 @@ FILE_SCOPE void SlowTexturedTriangle(DTRRenderBuffer *const renderBuffer, const { SetPixel(renderBuffer, bufferX, bufferY, color, ColorSpace_Linear); } + DEBUG_SLOW_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel); } } @@ -1046,346 +1101,10 @@ FILE_SCOPE void SlowTexturedTriangle(DTRRenderBuffer *const renderBuffer, const signedArea2Pixel += signedArea2DeltaY; signedArea3Pixel += signedArea3DeltaY; } + DEBUG_SLOW_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Rasterise); + DEBUG_SLOW_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle); } -FILE_SCOPE void SIMDTriangle(DTRRenderBuffer *const renderBuffer, const DqnV3 p1, const DqnV3 p2, - const DqnV3 p3, DqnV4 color) -{ - DTR_DEBUG_EP_TIMED_FUNCTION(); - DTRDebug_BeginCycleCount("SIMDTriangle", DTRDebugCycleCount_SIMDTriangle); - - //////////////////////////////////////////////////////////////////////////// - // Convert color - //////////////////////////////////////////////////////////////////////////// - __m128 simdColor = _mm_set_ps(color.a, color.b, color.g, color.r); - simdColor = SIMDSRGB1ToLinearSpace(simdColor); - simdColor = SIMDPreMultiplyAlpha1(simdColor); - - //////////////////////////////////////////////////////////////////////////// - // Render Bounds - //////////////////////////////////////////////////////////////////////////// - DqnV2i max = DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x), DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y)); - DqnV2i min = DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x), DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y)); - min.x = DQN_MAX(min.x, 0); - min.y = DQN_MAX(min.y, 0); - max.x = DQN_MIN(max.x, renderBuffer->width - 1); - max.y = DQN_MIN(max.y, renderBuffer->height - 1); - - //////////////////////////////////////////////////////////////////////////// - // Setup SIMD data - //////////////////////////////////////////////////////////////////////////// - const u32 NUM_X_PIXELS_TO_SIMD = 2; - const u32 NUM_Y_PIXELS_TO_SIMD = 1; - - const __m128 INV255_4X = _mm_set_ps1(1.0f / 255.0f); - const __m128 ZERO_4X = _mm_set_ps1(0.0f); - const u32 IS_GREATER_MASK = 0xF; - - // SignedArea: _mm_set_ps(unused, p3, p2, p1) ie 0=p1, 1=p1, 2=p3, 3=unused - __m128 signedAreaPixel1; - __m128 signedAreaPixel2; - - __m128 signedAreaPixelDeltaX; - __m128 signedAreaPixelDeltaY; - __m128 invSignedAreaParallelogram_4x; - - __m128 triangleZ = _mm_set_ps(0, p3.z, p2.z, p1.z); - { - DqnV2i startP = min; - f32 signedArea1Start = Triangle2TimesSignedArea(p2.xy, p3.xy, DqnV2_V2i(startP)); - f32 signedArea1DeltaX = p2.y - p3.y; - f32 signedArea1DeltaY = p3.x - p2.x; - - f32 signedArea2Start = Triangle2TimesSignedArea(p3.xy, p1.xy, DqnV2_V2i(startP)); - f32 signedArea2DeltaX = p3.y - p1.y; - f32 signedArea2DeltaY = p1.x - p3.x; - - f32 signedArea3Start = Triangle2TimesSignedArea(p1.xy, p2.xy, DqnV2_V2i(startP)); - f32 signedArea3DeltaX = p1.y - p2.y; - f32 signedArea3DeltaY = p2.x - p1.x; - - f32 signedAreaParallelogram = signedArea1Start + signedArea2Start + signedArea3Start; - if (signedAreaParallelogram == 0) return; - - f32 invSignedAreaParallelogram = 1.0f / signedAreaParallelogram; - invSignedAreaParallelogram_4x = _mm_set_ps1(invSignedAreaParallelogram); - - // NOTE: Order is important here! - signedAreaPixelDeltaX = _mm_set_ps(0, signedArea3DeltaX, signedArea2DeltaX, signedArea1DeltaX); - signedAreaPixelDeltaY = _mm_set_ps(0, signedArea3DeltaY, signedArea2DeltaY, signedArea1DeltaY); - - signedAreaPixel1 = _mm_set_ps(0, signedArea3Start, signedArea2Start, signedArea1Start); - signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX); - - // NOTE: Increase step size to the number of pixels rasterised with SIMD - { - const __m128 STEP_X_4X = _mm_set_ps1((f32)NUM_X_PIXELS_TO_SIMD); - const __m128 STEP_Y_4X = _mm_set_ps1((f32)NUM_Y_PIXELS_TO_SIMD); - - signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, STEP_X_4X); - signedAreaPixelDeltaY = _mm_mul_ps(signedAreaPixelDeltaY, STEP_Y_4X); - } - } - - const u32 zBufferPitch = renderBuffer->width; - //////////////////////////////////////////////////////////////////////////// - // Scan and Render - //////////////////////////////////////////////////////////////////////////// - DTRDebug_BeginCycleCount("SIMDTriangle_Rasterise", DTRDebugCycleCount_SIMDTriangle_Rasterise); - for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD) - { - __m128 signedArea1 = signedAreaPixel1; - __m128 signedArea2 = signedAreaPixel2; - - for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD) - { - - // Rasterise buffer(X, Y) pixel - { - __m128 checkArea = signedArea1; - __m128 isGreater = _mm_cmpge_ps(checkArea, ZERO_4X); - i32 isGreaterResult = _mm_movemask_ps(isGreater); - i32 posX = bufferX; - i32 posY = bufferY; - - if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK) - { - DTRDebug_BeginCycleCount("SIMDTriangle_RasterisePixel", - DTRDebugCycleCount_SIMDTriangle_RasterisePixel); - __m128 barycentric = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x); - __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); - - i32 zBufferIndex = posX + (posY * zBufferPitch); - f32 pixelZValue = ((f32 *)&barycentricZ)[0] + - ((f32 *)&barycentricZ)[1] + - ((f32 *)&barycentricZ)[2]; - f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; - if (pixelZValue > currZValue) - { - renderBuffer->zBuffer[zBufferIndex] = pixelZValue; - SIMDSetPixel(renderBuffer, posX, posY, simdColor, ColorSpace_Linear); - } - DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMDTriangle_RasterisePixel); - } - signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); - } - - // Rasterise buffer(X + 1, Y) pixel - { - __m128 checkArea = signedArea2; - __m128 isGreater = _mm_cmpge_ps(checkArea, ZERO_4X); - i32 isGreaterResult = _mm_movemask_ps(isGreater); - i32 posX = bufferX + 1; - i32 posY = bufferY; - if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK && posX < max.x) - { - __m128 barycentric = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x); - __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); - - i32 zBufferIndex = posX + (posY * zBufferPitch); - f32 pixelZValue = ((f32 *)&barycentricZ)[0] + - ((f32 *)&barycentricZ)[1] + - ((f32 *)&barycentricZ)[2]; - f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; - if (pixelZValue > currZValue) - { - renderBuffer->zBuffer[zBufferIndex] = pixelZValue; - SIMDSetPixel(renderBuffer, posX, posY, simdColor, ColorSpace_Linear); - } - } - signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX); - } - - } - - signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY); - signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY); - } - DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMDTriangle_Rasterise); - DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMDTriangle); -} - -FILE_SCOPE void SlowTriangle(DTRRenderBuffer *const renderBuffer, const DqnV3 p1, const DqnV3 p2, - const DqnV3 p3, DqnV4 color) -{ - DTRDebug_BeginCycleCount("SlowTriangle", DTRDebugCycleCount_SlowTriangle); - color = DTRRender_SRGB1ToLinearSpaceV4(color); - color = PreMultiplyAlpha1(color); - - DqnV2i max = DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x), DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y)); - DqnV2i min = DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x), DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y)); - min.x = DQN_MAX(min.x, 0); - min.y = DQN_MAX(min.y, 0); - max.x = DQN_MIN(max.x, renderBuffer->width - 1); - max.y = DQN_MIN(max.y, renderBuffer->height - 1); - - /* - ///////////////////////////////////////////////////////////////////////// - // Rearranging the Determinant - ///////////////////////////////////////////////////////////////////////// - Given two points that form a line and an extra point to test, we can - determine whether a point lies on the line, or is to the left or right of - a the line. - - We can do this using the PerpDotProduct conceptually known as the cross - product in 2D. This can be expressed using the determinant and is the - method we are using. - - First forming a 3x3 matrix of our terms with a, b being from the triangle - and test point c, we can derive a 2x2 matrix by subtracting the 1st - column from the 2nd and 1st column from the third. - - | ax bx cx | | (bx - ax) (cx - ax) | - m = | ay by cy | ==> | (by - ay) (cy - ay) | - | 1 1 1 | - - From our 2x2 representation we can calculate the determinant which gives - us the signed area of the triangle extended into a parallelogram. - - det(m) = (bx - ax)(cy - ay) - (by - ay)(cx - ax) - - Depending on the order of the vertices supplied, if it's - - CCW and c(x,y) is outside the line (triangle), the signed area is negative - - CCW and c(x,y) is inside the line (triangle), the signed area is positive - - CW and c(x,y) is outside the line (triangle), the signed area is positive - - CW and c(x,y) is inside the line (triangle), the signed area is negative - - ///////////////////////////////////////////////////////////////////////// - // Optimising the Determinant Calculation - ///////////////////////////////////////////////////////////////////////// - The det(m) can be rearranged if expanded to be - SignedArea(cx, cy) = (ay - by)cx + (bx - ay)cy + (ax*by - ay*bx) - - When we scan to fill our triangle we go pixel by pixel, left to right, - bottom to top, notice that this translates to +1 for x and +1 for y, i.e. - - The first pixel's signed area is cx, then cx+1, cx+2 .. etc - SignedArea(cx, cy) = (ay - by)cx + (bx - ax)cy + (ax*by - ay*bx) - SignedArea(cx+1, cy) = (ay - by)cx+1 + (bx - ax)cy + (ax*by - ay*bx) - - Then - SignedArea(cx+1, cy) - SignedArea(cx, cy) = - (ay - by)cx+1 + (bx - ax)cy + (ax*by - ay*bx) - - (ay - by)cx + (bx - ax)cy + (ax*by - ay*bx) - = (ay - by)cx+1 - (ay - by)cx - = (ay - by)(cx+1 - cx) - = (ay - by)(1) = (ay - by) - - Similarly when progressing in y - SignedArea(cx, cy) = (ay - by)cx + (bx - ay)cy + (ax*by - ay*bx) - SignedArea(cx, cy+1) = (ay - by)cx + (bx - ay)cy+1 + (ax*by - ay*bx) - - Then - SignedArea(cx, cy+1) - SignedArea(cx, cy) = - (ay - by)cx + (bx - ax)cy+1 + (ax*by - ay*bx) - - (ay - by)cx + (bx - ax)cy + (ax*by - ay*bx) - = (bx - ax)cy+1 - (bx - ax)cy - = (bx - ax)(cy+1 - cy) - = (bx - ax)(1) = (bx - ax) - - Then we can see that when we progress along x, we only need to change by - the value of SignedArea by (ay - by) and similarly for y, (bx - ax) - - ///////////////////////////////////////////////////////////////////////// - // Barycentric Coordinates - ///////////////////////////////////////////////////////////////////////// - At this point we have an equation that can be used to calculate the - 2x the signed area of a triangle, or the signed area of a parallelogram, - the two of which are equivalent. - - det(m) = (bx - ax)(cy - ay) - (by - ay)(cx - ax) - SignedArea(cx, cy) = (ay - by)cx + (bx - ay)cy + (ax*by - ay*bx) - - A barycentric coordinate is some coefficient on A, B, C that allows us to - specify an arbitrary point in the triangle as a linear combination of the - three usually with some coefficient [0, 1]. - - The SignedArea turns out to be actually the barycentric coord for c(x, y) - normalised to the sum of the parallelogram area. For example a triangle - with points, A, B, C and an arbitrary point P inside the triangle. Then - - SignedArea(P) with vertex A and B = Barycentric Coordinate for C - SignedArea(P) with vertex B and C = Barycentric Coordinate for A - SignedArea(P) with vertex C and A = Barycentric Coordinate for B - - B - / \ - / \ - / P \ - /_______\ - A C - - This is normalised to the area's sum, but we can trivially turn this into - a normalised version by dividing the area of the parallelogram, i.e. - - BaryCentricC(P) = (SignedArea(P) with vertex A and B)/SignedArea(with the orig triangle vertex) - BaryCentricA(P) = (SignedArea(P) with vertex B and C)/SignedArea(with the orig triangle vertex) - BaryCentricB(P) = (SignedArea(P) with vertex C and A)/SignedArea(with the orig triangle vertex) - */ - - DqnV2i startP = min; - f32 signedArea1Start = Triangle2TimesSignedArea(p2.xy, p3.xy, DqnV2_V2i(startP)); - f32 signedArea1DeltaX = p2.y - p3.y; - f32 signedArea1DeltaY = p3.x - p2.x; - - f32 signedArea2Start = Triangle2TimesSignedArea(p3.xy, p1.xy, DqnV2_V2i(startP)); - f32 signedArea2DeltaX = p3.y - p1.y; - f32 signedArea2DeltaY = p1.x - p3.x; - - f32 signedArea3Start = Triangle2TimesSignedArea(p1.xy, p2.xy, DqnV2_V2i(startP)); - f32 signedArea3DeltaX = p1.y - p2.y; - f32 signedArea3DeltaY = p2.x - p1.x; - - f32 signedAreaParallelogram = signedArea1Start + signedArea2Start + signedArea3Start; - if (signedAreaParallelogram == 0) return; - - f32 invSignedAreaParallelogram = 1.0f / signedAreaParallelogram; - - const DqnV3 p2SubP1 = p2 - p1; - const DqnV3 p3SubP1 = p3 - p1; - const u32 zBufferPitch = renderBuffer->width; - DTRDebug_BeginCycleCount("SlowTriangle_Rasterise", DTRDebugCycleCount_SlowTriangle_Rasterise); - for (i32 bufferY = min.y; bufferY < max.y; bufferY++) - { - f32 signedArea1 = signedArea1Start; - f32 signedArea2 = signedArea2Start; - f32 signedArea3 = signedArea3Start; - - for (i32 bufferX = min.x; bufferX < max.x; bufferX++) - { - if (signedArea1 >= 0 && signedArea2 >= 0 && signedArea3 >= 0) - { - DTRDebug_BeginCycleCount("SlowTriangle_RasterisePixel", - DTRDebugCycleCount_SlowTriangle_RasterisePixel); - f32 barycentricB = signedArea3 * invSignedAreaParallelogram; - f32 barycentricC = signedArea1 * invSignedAreaParallelogram; - - i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); - f32 pixelZValue = p1.z + (barycentricB * (p2SubP1.z)) + (barycentricC * (p3SubP1.z)); - f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; - DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height)); - if (pixelZValue > currZValue) - { - renderBuffer->zBuffer[zBufferIndex] = pixelZValue; - SetPixel(renderBuffer, bufferX, bufferY, color, ColorSpace_Linear); - } - DTRDebug_EndCycleCount(DTRDebugCycleCount_SlowTriangle_RasterisePixel); - } - - signedArea1 += signedArea1DeltaX; - signedArea2 += signedArea2DeltaX; - signedArea3 += signedArea3DeltaX; - } - - signedArea1Start += signedArea1DeltaY; - signedArea2Start += signedArea2DeltaY; - signedArea3Start += signedArea3DeltaY; - } - DTRDebug_EndCycleCount(DTRDebugCycleCount_SlowTriangle_Rasterise); - DTRDebug_EndCycleCount(DTRDebugCycleCount_SlowTriangle); -} - - void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform) @@ -1413,27 +1132,13 @@ void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, D //////////////////////////////////////////////////////////////////////////// // SIMD/Slow Path //////////////////////////////////////////////////////////////////////////// - if (texture) + if (globalDTRPlatformFlags.canUseSSE2 && 1) { - if (globalDTRPlatformFlags.canUseSSE2) - { - SIMDTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color, min, max); - } - else - { - SlowTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color, min, max); - } + SIMDTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color, min, max); } else { - if (globalDTRPlatformFlags.canUseSSE2) - { - SIMDTriangle(renderBuffer, p1, p2, p3, color); - } - else - { - SlowTriangle(renderBuffer, p1, p2, p3, color); - } + SlowTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color, min, max); } //////////////////////////////////////////////////////////////////////////// @@ -1536,56 +1241,8 @@ void DTRRender_Mesh(DTRRenderBuffer *const renderBuffer, DTRMesh *const mesh, co void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV4 color, const DTRRenderTransform transform) { -#if 0 - DTR_DEBUG_EP_TIMED_FUNCTION(); - //////////////////////////////////////////////////////////////////////////// - // Transform vertexes p1, p2, p3 inplace - //////////////////////////////////////////////////////////////////////////// - Make3PointsClockwise(&p1, &p2, &p3); - - // TODO(doyle): Transform is only in 2d right now - DqnV2 origin = Get2DOriginFromTransformAnchor(p1.xy, p2.xy, p3.xy, transform); - DqnV2 pList[3] = {p1.xy - origin, p2.xy - origin, p3.xy - origin}; - TransformPoints(origin, pList, DQN_ARRAY_COUNT(pList), transform.scale, transform.rotation); - - p1.xy = pList[0]; - p2.xy = pList[1]; - p3.xy = pList[2]; - - DqnRect bounds = GetBoundingBox(pList, DQN_ARRAY_COUNT(pList)); - DqnRect screenSpace = DqnRect_4i(0, 0, renderBuffer->width - 1, renderBuffer->height - 1); - bounds = DqnRect_ClipRect(bounds, screenSpace); - DqnV2i min = DqnV2i_V2(bounds.min); - DqnV2i max = DqnV2i_V2(bounds.max); - - //////////////////////////////////////////////////////////////////////////// - // SIMD/Slow Path - //////////////////////////////////////////////////////////////////////////// - if (globalDTRPlatformFlags.canUseSSE2) - { - SIMDTriangle(renderBuffer, p1, p2, p3, color); - } - else - { - SlowTriangle(renderBuffer, p1, p2, p3, color); - } - - //////////////////////////////////////////////////////////////////////////// - // Debug - //////////////////////////////////////////////////////////////////////////// - DTRDebug_CounterIncrement(DTRDebugCounter_RenderTriangle); - { - bool drawBoundingBox = true; - bool drawBasis = true; - bool drawVertexMarkers = true; - - DebugRenderMarkers(renderBuffer, pList, DQN_ARRAY_COUNT(pList), transform, drawBoundingBox, - drawBasis, drawVertexMarkers); - } -#else const DqnV2 noUV = {}; DTRRender_TexturedTriangle(renderBuffer, p1, p2, p3, noUV, noUV, noUV, NULL, color, transform); -#endif } void DTRRender_Bitmap(DTRRenderBuffer *const renderBuffer, DTRBitmap *const bitmap, DqnV2 pos, diff --git a/src/Win32DTRenderer.cpp b/src/Win32DTRenderer.cpp index a38007b..4f4e740 100644 --- a/src/Win32DTRenderer.cpp +++ b/src/Win32DTRenderer.cpp @@ -578,7 +578,7 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, //////////////////////////////////////////////////////////////////////// f64 startFrameTimeInS = DqnTime_NowInS(); - FILETIME lastWriteTime = Win32GetLastWriteTime(dllPath); + FILETIME lastWriteTime = Win32GetLastWriteTime(dllPath); if (CompareFileTime(&lastWriteTime, &dllCode.lastWriteTime) != 0) { Win32UnloadExternalDLL(&dllCode);