From 49270a282669354605377a223a3fdd1916dee405 Mon Sep 17 00:00:00 2001 From: Doyle Thai Date: Tue, 30 May 2017 17:41:05 +1000 Subject: [PATCH] Accelerate textured triangle rendering using SIMD --- src/DTRenderer.cpp | 4 +- src/DTRendererAsset.cpp | 60 ++++--- src/DTRendererDebug.cpp | 18 +- src/DTRendererDebug.h | 14 +- src/DTRendererRender.cpp | 354 ++++++++++++++++++++++++++++----------- src/DTRendererRender.h | 2 +- src/build.bat | 4 +- 7 files changed, 324 insertions(+), 132 deletions(-) diff --git a/src/DTRenderer.cpp b/src/DTRenderer.cpp index 6f9c813..329e05f 100644 --- a/src/DTRenderer.cpp +++ b/src/DTRenderer.cpp @@ -1104,8 +1104,8 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer, } else { - DTRRender_TexturedTriangle(&renderBuffer, screenVA, screenVB, screenVC, texA, texB, - texC, &state->mesh.tex, modelCol); + DTRRender_TexturedTriangle(input, &renderBuffer, screenVA, screenVB, screenVC, texA, + texB, texC, &state->mesh.tex, modelCol); } bool DEBUG_WIREFRAME = false; diff --git a/src/DTRendererAsset.cpp b/src/DTRendererAsset.cpp index 1cdb798..062ab7b 100644 --- a/src/DTRendererAsset.cpp +++ b/src/DTRendererAsset.cpp @@ -17,14 +17,23 @@ void DTRAsset_InitGlobalState() stbi_set_flip_vertically_on_load(true); } -FILE_SCOPE void MemcopyInternal(u8 *dest, u8 *src, size_t numBytes) +FILE_SCOPE void MemcopyInternal(u8 *const dest, u8 *const src, size_t numBytes) { if (!dest || !src || numBytes == 0) return; for (size_t i = 0; i < numBytes; i++) dest[i] = src[i]; } -FILE_SCOPE void AssetDqnArrayMemAPICallback(DqnMemAPICallbackInfo info, DqnMemAPICallbackResult *result) +// NOTE: Dynamic array allocations just requests space at the first option it +// can take. Realloc will reallocate in place if there's space. Otherwise +// it'll create a new block and reallocate there by copying the old data over. + +// So this does waste space. But is a quick way to reroute allocations into +// a MemStack. 
Its main intended purpose is for one-shot loading data that you +// don't know how much space you need in your DArray. After filling out +// the dynamic array you then compact the data manually using memcopys into +// a new block and discard the old data. +FILE_SCOPE void DumbDynamicArrayMemAPICallback(DqnMemAPICallbackInfo info, DqnMemAPICallbackResult *result) { DQN_ASSERT(info.type != DqnMemAPICallbackType_Invalid); DqnMemStack *stack = static_cast(info.userContext); @@ -40,27 +49,7 @@ FILE_SCOPE void AssetDqnArrayMemAPICallback(DqnMemAPICallbackInfo info, DqnMemAP case DqnMemAPICallbackType_Free: { - DqnMemStackBlock **blockPtr = &stack->block; - while (*blockPtr && (*blockPtr)->memory != info.ptrToFree) - { - // NOTE(doyle): Ensure that the base ptr of each block is always - // actually aligned so we don't ever miss finding the block if - // the allocator had to realign the pointer from the base - // address. - if (DTR_DEBUG) - { - size_t memBaseAddr = (size_t)((*blockPtr)->memory); - DQN_ASSERT(DQN_ALIGN_POW_N(memBaseAddr, stack->byteAlign) == - memBaseAddr); - } - blockPtr = &((*blockPtr)->prevBlock); - } - - DQN_ASSERT(*blockPtr && (*blockPtr)->memory == info.ptrToFree); - DqnMemStackBlock *blockToFree = *blockPtr; - *blockPtr = blockToFree->prevBlock; - DqnMem_Free(blockToFree); - + DQN_ASSERT(DQN_INVALID_CODE_PATH); } break; @@ -189,7 +178,7 @@ bool DTRAsset_LoadWavefrontObj(const PlatformAPI api, DqnMemStack *const memStac size_t fileSize = file.size; DqnMemAPI memAPI = {}; - memAPI.callback = AssetDqnArrayMemAPICallback; + memAPI.callback = DumbDynamicArrayMemAPICallback; memAPI.userContext = memStack; enum WavefVertexType { @@ -199,6 +188,23 @@ bool DTRAsset_LoadWavefrontObj(const PlatformAPI api, DqnMemStack *const memStac WavefVertexType_Normal, }; + // TODO(doyle): We should profile, reading it out to WavefModel format and + // then copying it over, versus just reading the file twice. First pass is + // to count the number of vertexes etc. 
for each section we need. Then the + // second pass we can allocate directly the number we need and reparse it. + // I have a feeling that, in general that's a better idea, at least it gets + // rid of a lot of stupid copying code and memstack juggling. + + // NOTE(doyle): We pre-process the data into an intermediate format that + // more accurately represents the file format. Since there's no metadata + // inside Wavefront objects, we don't know how many vertexes/texUV/normals + // there are- which makes it hard to allocate "nicely" out of our memory + // stack. + + // So we preprocess. Then once we know the final amount, copy over the data + // to a new memstack block such that all the data is compacted together in + // memory for locality. Then just throw away the intermediate + // representation. WavefModel dummy_ = {}; WavefModel *obj = &dummy_; @@ -410,11 +416,17 @@ bool DTRAsset_LoadWavefrontObj(const PlatformAPI api, DqnMemStack *const memStac DQN_ASSERT(obj->groupNameIndex + 1 < DQN_ARRAY_COUNT(obj->groupName)); DQN_ASSERT(!obj->groupName[obj->groupNameIndex]); + // TODO(doyle): Broken since I don't "copy" it over to our + // final DTRMesh. Below I copy over the data so that all the + // allocations are compacted together but don't copy this + // yet. Which means the name gets trashed atm. 
+#if 0 obj->groupName[obj->groupNameIndex++] = (char *)DqnMemStack_Push(memStack, (nameLen + 1) * sizeof(char)); for (i32 i = 0; i < nameLen; i++) obj->groupName[obj->groupNameIndex - 1][i] = namePtr[i]; +#endif while (scan && (*scan == ' ' || *scan == '\n')) scan++; diff --git a/src/DTRendererDebug.cpp b/src/DTRendererDebug.cpp index c73693a..abcd50b 100644 --- a/src/DTRendererDebug.cpp +++ b/src/DTRendererDebug.cpp @@ -121,7 +121,9 @@ void inline DTRDebug_BeginCycleCount(enum DTRDebugCycleCount tag) { if (globalDebug.input && globalDebug.input->canUseRdtsc) { - globalDebug.cycleCount[tag] = __rdtsc(); + DTRDebugCycles *const cycles = &globalDebug.cycles[tag]; + cycles->tmpStartCycles = __rdtsc(); + cycles->numInvokes++; } } } @@ -132,7 +134,8 @@ void inline DTRDebug_EndCycleCount(enum DTRDebugCycleCount tag) { if (globalDebug.input && globalDebug.input->canUseRdtsc) { - globalDebug.cycleCount[tag] = __rdtsc() - globalDebug.cycleCount[tag]; + DTRDebugCycles *const cycles = &globalDebug.cycles[tag]; + cycles->totalCycles += __rdtsc() - cycles->tmpStartCycles; } } } @@ -214,9 +217,16 @@ void DTRDebug_Update(DTRState *const state, DTRDebug_PushText("TrianglesRendered: %'lld", debug->counter[DTRDebugCounter_RenderTriangle]); DTRDebug_PushText(""); - for (i32 i = 0; i < DQN_ARRAY_COUNT(debug->cycleCount); i++) + DTRDebugCycles emptyDebugCycles = {}; + for (i32 i = 0; i < DQN_ARRAY_COUNT(debug->cycles); i++) { - DTRDebug_PushText("%d: %'lld cycles", i, debug->cycleCount[i]); + DTRDebugCycles *const cycles = &globalDebug.cycles[i]; + + u64 invocations = (cycles->numInvokes == 0) ? 
1 : cycles->numInvokes; + u64 avgCycles = cycles->totalCycles / invocations; + DTRDebug_PushText("%d: %'lld avg cycles", i, avgCycles); + + *cycles = emptyDebugCycles; } DTRDebug_PushText(""); diff --git a/src/DTRendererDebug.h b/src/DTRendererDebug.h index 89ea88a..1c4194f 100644 --- a/src/DTRendererDebug.h +++ b/src/DTRendererDebug.h @@ -47,6 +47,14 @@ enum DTRDebugCycleCount DTRDebugCycleCount_Count, }; +typedef struct DTRDebugCycles +{ + u64 totalCycles; + u64 numInvokes; + + u64 tmpStartCycles; // Used to calculate the number of cycles elapsed +} DTRDebugCycles; + typedef struct DTRDebug { struct DTRFont *font; @@ -57,9 +65,9 @@ typedef struct DTRDebug DqnV2 displayP; i32 displayYOffset; - u64 cycleCount[DTRDebugCycleCount_Count]; - u64 counter [DTRDebugCounter_Count]; - u64 totalSetPixels; + DTRDebugCycles cycles [DTRDebugCycleCount_Count]; + u64 counter[DTRDebugCounter_Count]; + u64 totalSetPixels; } DTRDebug; extern DTRDebug globalDebug; diff --git a/src/DTRendererRender.cpp b/src/DTRendererRender.cpp index 1311adb..c2c2c7a 100644 --- a/src/DTRendererRender.cpp +++ b/src/DTRendererRender.cpp @@ -7,6 +7,8 @@ #include "external/stb_rect_pack.h" #include "external/stb_truetype.h" +#include + FILE_SCOPE const f32 COLOR_EPSILON = 0.9f; FILE_SCOPE inline DqnV4 PreMultiplyAlpha1(const DqnV4 color) @@ -497,7 +499,8 @@ FILE_SCOPE void DebugBarycentricInternal(DqnV2 p, DqnV2 a, DqnV2 b, DqnV2 c, f32 *u = 1.0f - *v - *w; } -void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, +void DTRRender_TexturedTriangle(PlatformInput *const input, + DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform) { @@ -549,125 +552,287 @@ void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, D const DqnV3 b = p2; const DqnV3 c = p3; - DqnV2i startP = min; - f32 oldSignedArea1 = ((b.x - a.x) * (startP.y 
- a.y)) - ((b.y - a.y) * (startP.x - a.x)); - f32 oldSignedArea2 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x)); - f32 oldSignedArea3 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x)); - - f32 signedArea1 = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x)); - f32 signedArea1DeltaX = a.y - b.y; - f32 signedArea1DeltaY = b.x - a.x; - - f32 signedArea2 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x)); - f32 signedArea2DeltaX = b.y - c.y; - f32 signedArea2DeltaY = c.x - b.x; - - f32 signedArea3 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x)); - f32 signedArea3DeltaX = c.y - a.y; - f32 signedArea3DeltaY = a.x - c.x; - - f32 signedAreaParallelogram = signedArea1 + signedArea2 + signedArea3; - if (signedAreaParallelogram == 0) return; - f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram; - DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise); //////////////////////////////////////////////////////////////////////////// // Scan and Render //////////////////////////////////////////////////////////////////////////// - const u32 zBufferPitch = renderBuffer->width; - const f32 BARYCENTRIC_EPSILON = 0.1f; - - u8 *texturePtr = texture->memory; - const u32 texturePitch = texture->bytesPerPixel * texture->dim.w; - for (i32 bufferY = min.y; bufferY < max.y; bufferY++) + const u32 zBufferPitch = renderBuffer->width; + if (input->canUseSSE2) { - f32 signedArea1Row = signedArea1; - f32 signedArea2Row = signedArea2; - f32 signedArea3Row = signedArea3; + DqnV2i startP = min; + f32 edge1SignedAreaPixel1 = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x)); + f32 edge1SignedAreaPixel1DeltaX = a.y - b.y; + f32 edge1SignedAreaPixel1DeltaY = b.x - a.x; - for (i32 bufferX = min.x; bufferX < max.x; bufferX++) + f32 edge2SignedAreaPixel1 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x)); + f32 edge2SignedAreaPixel1DeltaX = b.y - 
c.y; + f32 edge2SignedAreaPixel1DeltaY = c.x - b.x; + + f32 edge3SignedAreaPixel1 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x)); + f32 edge3SignedAreaPixel1DeltaX = c.y - a.y; + f32 edge3SignedAreaPixel1DeltaY = a.x - c.x; + + f32 signedAreaParallelogramPixel1 = edge1SignedAreaPixel1 + edge2SignedAreaPixel1 + edge3SignedAreaPixel1; + if (signedAreaParallelogramPixel1 == 0) return; + f32 invSignedAreaParallelogramPixel1 = 1 / signedAreaParallelogramPixel1; + + __m128 zero_4x = _mm_set_ps1(0.0f); + __m128 two_4x = _mm_set_ps1(2.0f); + __m128 invSignedAreaParallelogram4x = _mm_set_ps1(invSignedAreaParallelogramPixel1); + __m128 triangleZ = _mm_set_ps(0, b.z, a.z, c.z); + + __m128 signedAreaPixelDeltaX = _mm_set_ps(0, edge3SignedAreaPixel1DeltaX, edge2SignedAreaPixel1DeltaX, edge1SignedAreaPixel1DeltaX); + __m128 signedAreaPixelDeltaY = _mm_set_ps(0, edge3SignedAreaPixel1DeltaY, edge2SignedAreaPixel1DeltaY, edge1SignedAreaPixel1DeltaY); + + __m128 signedAreaPixel1 = _mm_set_ps(0, edge3SignedAreaPixel1, edge2SignedAreaPixel1, edge1SignedAreaPixel1); + __m128 signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX); + + // NOTE: Step size of 2 pixels across X + signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, two_4x); + + const DqnV2 uv2SubUv1 = uv2 - uv1; + const DqnV2 uv3SubUv1 = uv3 - uv1; + + const u32 IS_GREATER_MASK = 0xF; + + for (i32 bufferY = min.y; bufferY < max.y; bufferY++) { - if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0) + __m128 signedArea1 = signedAreaPixel1; + __m128 signedArea2 = signedAreaPixel2; + + for (i32 bufferX = min.x; bufferX < max.x; bufferX += 2) { - f32 barycentricB = signedArea3Row * invSignedAreaParallelogram; - f32 barycentricC = signedArea1Row * invSignedAreaParallelogram; - - if (DTR_DEBUG) + __m128 isGreater1 = _mm_cmpge_ps(signedArea1, zero_4x); + i32 isGreaterResult1 = _mm_movemask_ps(isGreater1); + if ((isGreaterResult1 & IS_GREATER_MASK) == IS_GREATER_MASK) { - 
const f32 EPSILON = 0.1f; + __m128 barycentric = _mm_mul_ps(signedArea1, invSignedAreaParallelogram4x); + __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); - f32 debugSignedArea1 = ((b.x - a.x) * (bufferY - a.y)) - ((b.y - a.y) * (bufferX - a.x)); - f32 debugSignedArea2 = ((c.x - b.x) * (bufferY - b.y)) - ((c.y - b.y) * (bufferX - b.x)); - f32 debugSignedArea3 = ((a.x - c.x) * (bufferY - c.y)) - ((a.y - c.y) * (bufferX - c.x)); + i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); + f32 pixelZValue = ((f32 *)&barycentricZ)[0] + + ((f32 *)&barycentricZ)[1] + + ((f32 *)&barycentricZ)[2]; + f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; + if (pixelZValue > currZValue) + { + renderBuffer->zBuffer[zBufferIndex] = pixelZValue; + u8 *texturePtr = texture->memory; + const u32 texturePitch = texture->bytesPerPixel * texture->dim.w; - f32 deltaSignedArea1 = debugSignedArea1 - signedArea1Row; - f32 deltaSignedArea2 = debugSignedArea2 - signedArea2Row; - f32 deltaSignedArea3 = debugSignedArea3 - signedArea3Row; - DQN_ASSERT(deltaSignedArea1 < EPSILON && deltaSignedArea2 < EPSILON && - deltaSignedArea3 < EPSILON) + f32 barycentricB = ((f32 *)&barycentric)[2]; + f32 barycentricC = ((f32 *)&barycentric)[0]; + DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC); - f32 debugBarycentricA, debugBarycentricB, debugBarycentricC; - DebugBarycentricInternal(DqnV2_2i(bufferX, bufferY), a.xy, b.xy, c.xy, - &debugBarycentricA, &debugBarycentricB, - &debugBarycentricC); + const f32 EPSILON = 0.1f; + DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON); + DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON); + uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f); + uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f); + f32 texelXf = uv.x * texture->dim.w; + f32 texelYf = uv.y * texture->dim.h; + DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w); + DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h); - f32 deltaBaryB = DQN_ABS(barycentricB - debugBarycentricB); - f32 deltaBaryC = 
DQN_ABS(barycentricC - debugBarycentricC); + i32 texelX = (i32)texelXf; + i32 texelY = (i32)texelYf; + + u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + + (texelY * texturePitch)); + + DqnV4 color1; + color1.a = (f32)(texel1 >> 24); + color1.b = (f32)((texel1 >> 16) & 0xFF); + color1.g = (f32)((texel1 >> 8) & 0xFF); + color1.r = (f32)((texel1 >> 0) & 0xFF); + color1 *= DTRRENDER_INV_255; + color1 = DTRRender_SRGB1ToLinearSpaceV4(color1); + DqnV4 blend = color * color1; + SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear); + } - DQN_ASSERT(deltaBaryB < EPSILON && deltaBaryC < EPSILON) } - i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); - f32 pixelZValue = a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z)); - f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; - DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height)); - - if (pixelZValue > currZValue) + __m128 isGreater2 = _mm_cmpge_ps(signedArea2, zero_4x); + i32 isGreaterResult2 = _mm_movemask_ps(isGreater2); + i32 bufferX1 = bufferX + 1; + if ((isGreaterResult2 & IS_GREATER_MASK) == IS_GREATER_MASK && bufferX1 < max.x) { - renderBuffer->zBuffer[zBufferIndex] = pixelZValue; - const bool DEBUG_SAMPLE_TEXTURE = true; - DqnV2 uv = uv1 + ((uv2 - uv1) * barycentricB) + ((uv3 - uv1) * barycentricC); + __m128 barycentric = _mm_mul_ps(signedArea2, invSignedAreaParallelogram4x); + __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); - const f32 EPSILON = 0.1f; - DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON); - DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON); + i32 zBufferIndex = bufferX1 + (bufferY * zBufferPitch); + f32 pixelZValue = ((f32 *)&barycentricZ)[0] + + ((f32 *)&barycentricZ)[1] + + ((f32 *)&barycentricZ)[2]; + f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; + if (pixelZValue > currZValue) + { + renderBuffer->zBuffer[zBufferIndex] = pixelZValue; + u8 *texturePtr = texture->memory; + const u32 texturePitch = 
texture->bytesPerPixel * texture->dim.w; - uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f); - uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f); + f32 barycentricB = ((f32 *)&barycentric)[2]; + f32 barycentricC = ((f32 *)&barycentric)[0]; + DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC); - f32 texelXf = uv.x * texture->dim.w; - f32 texelYf = uv.y * texture->dim.h; - DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w); - DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h); + const f32 EPSILON = 0.1f; + DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON); + DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON); + uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f); + uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f); - i32 texelX = (i32)texelXf; - i32 texelY = (i32)texelYf; + f32 texelXf = uv.x * texture->dim.w; + f32 texelYf = uv.y * texture->dim.h; + DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w); + DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h); - u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + - (texelY * texturePitch)); + i32 texelX = (i32)texelXf; + i32 texelY = (i32)texelYf; - DqnV4 color1; - color1.a = (f32)(texel1 >> 24); - color1.b = (f32)((texel1 >> 16) & 0xFF); - color1.g = (f32)((texel1 >> 8) & 0xFF); - color1.r = (f32)((texel1 >> 0) & 0xFF); + u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + + (texelY * texturePitch)); - color1 *= DTRRENDER_INV_255; - color1 = DTRRender_SRGB1ToLinearSpaceV4(color1); - DqnV4 blend = color * color1; - SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear); + DqnV4 color1; + color1.a = (f32)(texel1 >> 24); + color1.b = (f32)((texel1 >> 16) & 0xFF); + color1.g = (f32)((texel1 >> 8) & 0xFF); + color1.r = (f32)((texel1 >> 0) & 0xFF); + color1 *= DTRRENDER_INV_255; + color1 = DTRRender_SRGB1ToLinearSpaceV4(color1); + DqnV4 blend = color * color1; + SetPixel(renderBuffer, bufferX1, bufferY, blend, ColorSpace_Linear); + } } + + signedArea1 = _mm_add_ps(signedArea1, 
signedAreaPixelDeltaX); + signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX); } - signedArea1Row += signedArea1DeltaX; - signedArea2Row += signedArea2DeltaX; - signedArea3Row += signedArea3DeltaX; + signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY); + signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY); } + } + else + { + DqnV2i startP = min; + f32 signedArea1 = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x)); + f32 signedArea1DeltaX = a.y - b.y; + f32 signedArea1DeltaY = b.x - a.x; - signedArea1 += signedArea1DeltaY; - signedArea2 += signedArea2DeltaY; - signedArea3 += signedArea3DeltaY; + f32 signedArea2 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x)); + f32 signedArea2DeltaX = b.y - c.y; + f32 signedArea2DeltaY = c.x - b.x; + + f32 signedArea3 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x)); + f32 signedArea3DeltaX = c.y - a.y; + f32 signedArea3DeltaY = a.x - c.x; + + f32 signedAreaParallelogram = signedArea1 + signedArea2 + signedArea3; + if (signedAreaParallelogram == 0) return; + f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram; + + for (i32 bufferY = min.y; bufferY < max.y; bufferY++) + { + f32 signedArea1Row = signedArea1; + f32 signedArea2Row = signedArea2; + f32 signedArea3Row = signedArea3; + + for (i32 bufferX = min.x; bufferX < max.x; bufferX++) + { + if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0) + { + f32 barycentricB = signedArea3Row * invSignedAreaParallelogram; + f32 barycentricC = signedArea1Row * invSignedAreaParallelogram; + + if (DTR_DEBUG) + { + const f32 EPSILON = 0.1f; + + f32 debugSignedArea1 = ((b.x - a.x) * (bufferY - a.y)) - ((b.y - a.y) * (bufferX - a.x)); + f32 debugSignedArea2 = ((c.x - b.x) * (bufferY - b.y)) - ((c.y - b.y) * (bufferX - b.x)); + f32 debugSignedArea3 = ((a.x - c.x) * (bufferY - c.y)) - ((a.y - c.y) * (bufferX - c.x)); + + f32 deltaSignedArea1 = DQN_ABS(debugSignedArea1 
- signedArea1Row); + f32 deltaSignedArea2 = DQN_ABS(debugSignedArea2 - signedArea2Row); + f32 deltaSignedArea3 = DQN_ABS(debugSignedArea3 - signedArea3Row); + DQN_ASSERT(deltaSignedArea1 < EPSILON && deltaSignedArea2 < EPSILON && + deltaSignedArea3 < EPSILON) + + f32 debugBarycentricA, debugBarycentricB, debugBarycentricC; + DebugBarycentricInternal(DqnV2_2i(bufferX, bufferY), a.xy, b.xy, c.xy, + &debugBarycentricA, &debugBarycentricB, + &debugBarycentricC); + + f32 deltaBaryB = DQN_ABS(barycentricB - debugBarycentricB); + f32 deltaBaryC = DQN_ABS(barycentricC - debugBarycentricC); + + DQN_ASSERT(deltaBaryB < EPSILON && deltaBaryC < EPSILON) + } + + i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); + f32 pixelZValue = + a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z)); + f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; + DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height)); + + if (pixelZValue > currZValue) + { + renderBuffer->zBuffer[zBufferIndex] = pixelZValue; + if (texture) + { + u8 *texturePtr = texture->memory; + const u32 texturePitch = texture->bytesPerPixel * texture->dim.w; + + DqnV2 uv = + uv1 + ((uv2 - uv1) * barycentricB) + ((uv3 - uv1) * barycentricC); + + const f32 EPSILON = 0.1f; + DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON); + DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON); + + uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f); + uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f); + + f32 texelXf = uv.x * texture->dim.w; + f32 texelYf = uv.y * texture->dim.h; + DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w); + DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h); + + i32 texelX = (i32)texelXf; + i32 texelY = (i32)texelYf; + + u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + + (texelY * texturePitch)); + + DqnV4 color1; + color1.a = (f32)(texel1 >> 24); + color1.b = (f32)((texel1 >> 16) & 0xFF); + color1.g = (f32)((texel1 >> 8) & 0xFF); + color1.r = (f32)((texel1 >> 0) & 0xFF); + + color1 *= 
DTRRENDER_INV_255; + color1 = DTRRender_SRGB1ToLinearSpaceV4(color1); + DqnV4 blend = color * color1; + SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear); + } + else + { + SetPixel(renderBuffer, bufferX, bufferY, color, ColorSpace_Linear); + } + } + } + + signedArea1Row += signedArea1DeltaX; + signedArea2Row += signedArea2DeltaX; + signedArea3Row += signedArea3DeltaX; + } + + signedArea1 += signedArea1DeltaY; + signedArea2 += signedArea2DeltaY; + signedArea3 += signedArea3DeltaY; + } } DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise); @@ -881,12 +1046,10 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, if (signedAreaParallelogram == 0) return; f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram; - DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise); //////////////////////////////////////////////////////////////////////////// // Scan and Render //////////////////////////////////////////////////////////////////////////// - const u32 zBufferPitch = renderBuffer->width; - const f32 BARYCENTRIC_EPSILON = 0.1f; + const u32 zBufferPitch = renderBuffer->width; for (i32 bufferY = min.y; bufferY < max.y; bufferY++) { f32 signedArea1Row = signedArea1; @@ -903,6 +1066,7 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); f32 pixelZValue = a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z)); f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; + DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height)); if (pixelZValue > currZValue) { renderBuffer->zBuffer[zBufferIndex] = pixelZValue; @@ -919,7 +1083,6 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, signedArea2 += signedArea2DeltaY; signedArea3 += signedArea3DeltaY; } - DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise); 
//////////////////////////////////////////////////////////////////////////// // Debug @@ -958,8 +1121,7 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, } } -void DTRRender_Bitmap(DTRRenderBuffer *const renderBuffer, - DTRBitmap *const bitmap, DqnV2 pos, +void DTRRender_Bitmap(DTRRenderBuffer *const renderBuffer, DTRBitmap *const bitmap, DqnV2 pos, const DTRRenderTransform transform, DqnV4 color) { if (!bitmap || !bitmap->memory || !renderBuffer) return; diff --git a/src/DTRendererRender.h b/src/DTRendererRender.h index 5687099..82c42c6 100644 --- a/src/DTRendererRender.h +++ b/src/DTRendererRender.h @@ -63,7 +63,7 @@ void DTRRender_Text (DTRRenderBuffer *const renderBuffer, const DTRFo void DTRRender_Line (DTRRenderBuffer *const renderBuffer, DqnV2i a, DqnV2i b, DqnV4 color); void DTRRender_Rectangle (DTRRenderBuffer *const renderBuffer, DqnV2 min, DqnV2 max, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTransform()); void DTRRender_Triangle (DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform()); -void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform()); +void DTRRender_TexturedTriangle(PlatformInput *const input, DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform()); void DTRRender_Bitmap (DTRRenderBuffer *const renderBuffer, DTRBitmap *const bitmap, DqnV2 pos, const DTRRenderTransform transform = DTRRender_DefaultTransform(), DqnV4 color = DqnV4_4f(1, 1, 1, 1)); void DTRRender_Clear (DTRRenderBuffer *const renderBuffer, DqnV3 color); diff --git a/src/build.bat 
b/src/build.bat index 1fd2bed..9b76365 100644 --- a/src/build.bat +++ b/src/build.bat @@ -39,7 +39,7 @@ REM wd4100 unused argument parameters REM wd4201 nonstandard extension used: nameless struct/union REM wd4189 local variable is initialised but not referenced REM wd4505 unreferenced local function not used will be removed -set CompileFlags=-EHsc -GR- -Oi -MT -Z7 -W4 -wd4100 -wd4201 -wd4189 -wd4505 -Od -FAsc /I..\src\external\ +set CompileFlags=-EHsc -GR- -Oi -MT -Z7 -W4 -wd4100 -wd4201 -wd4189 -wd4505 -O2 -FAsc /I..\src\external\ set DLLFlags=/Fm%ProjectName% /Fo%ProjectName% /Fa%ProjectName% /Fe%ProjectName% set Win32Flags=/FmWin32DTRenderer /FeWin32DTRenderer @@ -62,7 +62,7 @@ REM //////////////////////////////////////////////////////////////////////////// del *.pdb >NUL 2>NUL cl %CompileFlags% %Win32Flags% ..\src\Win32DTRenderer.cpp /link %LinkLibraries% %LinkFlags% REM cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link ..\src\external\easy\easy_profiler.lib /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags% -cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags% +cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags% popd set LastError=%ERRORLEVEL%