From e6602812117cb9ce1a3b34f24b68287b0ccbb2c1 Mon Sep 17 00:00:00 2001 From: Doyle Thai Date: Mon, 19 Jun 2017 14:14:13 +1000 Subject: [PATCH] Fix multithread bug causing invalid ptr references Mesh rendering was not waiting until all jobs were complete before moving on, causing longer jobs to use old pointer references once the next frame started rendering. --- src/DTRenderer.cpp | 241 +++++++++++++++++++-------------------- src/DTRenderer.h | 3 +- src/DTRendererPlatform.h | 2 + src/DTRendererRender.cpp | 92 ++++++--------- src/DTRendererRender.h | 9 +- src/Win32DTRenderer.cpp | 74 +++++++++--- src/build.bat | 41 +++++-- 7 files changed, 250 insertions(+), 212 deletions(-) diff --git a/src/DTRenderer.cpp b/src/DTRenderer.cpp index 9916949..eb30646 100644 --- a/src/DTRenderer.cpp +++ b/src/DTRenderer.cpp @@ -899,17 +899,8 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer, //////////////////////////////////////////////////////////////////////// DqnMemStack *const assetStack = &memory->assetStack; DqnMemStack *const tempStack = &memory->tempStack; - state->zDepthLock = input->api.LockInit(&memory->mainStack); - if (state->zDepthLock) - { - state->blitLock = input->api.LockInit(&memory->mainStack); - if (!state->blitLock) - { - // TODO(doyle): Not enough memory die gracefully - DQN_ASSERT(DQN_INVALID_CODE_PATH); - } - } - else + state->renderLock = input->api.LockInit(&memory->mainStack); + if (!state->renderLock) { // TODO(doyle): Not enough memory die gracefully DQN_ASSERT(DQN_INVALID_CODE_PATH); @@ -952,144 +943,149 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer, "byte_read_check.bmp"); } } + } - auto tempMemRegion = DqnMemStackTempRegionScoped(&memory->tempStack); - if (tempMemRegion.isInit) { - size_t debugSize = DQN_MEGABYTE(1); - u8 *debugMemory = (u8 *)DqnMemStack_Push(&memory->tempStack, debugSize); - DqnMemStack_InitWithFixedMem(&globalDebug.memStack, debugMemory, debugSize); - 
DTRDebug_BeginCycleCount("DTR_Update", DTRDebugCycleCount_DTR_Update); + auto tempMemRegion = DqnMemStackTempRegionScoped(&memory->tempStack); + if (tempMemRegion.isInit) + { + size_t debugSize = DQN_MEGABYTE(1); + u8 *debugMemory = (u8 *)DqnMemStack_Push(&memory->tempStack, debugSize); + DqnMemStack_InitWithFixedMem(&globalDebug.memStack, debugMemory, debugSize); + DTRDebug_BeginCycleCount("DTR_Update", DTRDebugCycleCount_DTR_Update); - DTRRenderBuffer renderBuffer = {}; - renderBuffer.width = platformRenderBuffer->width; - renderBuffer.height = platformRenderBuffer->height; - renderBuffer.bytesPerPixel = platformRenderBuffer->bytesPerPixel; - renderBuffer.memory = (u8 *)platformRenderBuffer->memory; - renderBuffer.zDepthLock = state->zDepthLock; - renderBuffer.blitLock = state->blitLock; + DTRRenderBuffer renderBuffer = {}; + renderBuffer.width = platformRenderBuffer->width; + renderBuffer.height = platformRenderBuffer->height; + renderBuffer.bytesPerPixel = platformRenderBuffer->bytesPerPixel; + renderBuffer.memory = (u8 *)platformRenderBuffer->memory; + renderBuffer.renderLock = state->renderLock; - u32 zBufferSize = platformRenderBuffer->width * platformRenderBuffer->height; - renderBuffer.zBuffer = (f32 *)DqnMemStack_Push(&memory->tempStack, - zBufferSize * sizeof(*renderBuffer.zBuffer)); + u32 zBufferSize = platformRenderBuffer->width * platformRenderBuffer->height; + renderBuffer.zBuffer = (f32 *)DqnMemStack_Push( + &memory->tempStack, zBufferSize * sizeof(*renderBuffer.zBuffer)); - for (u32 i = 0; i < zBufferSize; i++) - renderBuffer.zBuffer[i] = DQN_F32_MIN; + for (u32 i = 0; i < zBufferSize; i++) + renderBuffer.zBuffer[i] = DQN_F32_MIN; - DTRRenderContext renderContext = {}; - renderContext.renderBuffer = &renderBuffer; - renderContext.tempStack = &memory->tempStack; - renderContext.api = &input->api; - //////////////////////////////////////////////////////////////////////////// - // Update and Render - 
//////////////////////////////////////////////////////////////////////////// - DTRRender_Clear(renderContext, DqnV3_3f(0.5f, 0.0f, 1.0f)); + DTRRenderContext renderContext = {}; + renderContext.renderBuffer = &renderBuffer; + renderContext.tempStack = &memory->tempStack; + renderContext.api = &input->api; + //////////////////////////////////////////////////////////////////////////// + // Update and Render + //////////////////////////////////////////////////////////////////////////// + DTRRender_Clear(renderContext, DqnV3_3f(0.5f, 0.0f, 1.0f)); #if 1 - DqnV4 colorRed = DqnV4_4f(0.8f, 0, 0, 1); - DqnV2i bufferMidP = DqnV2i_2f(renderBuffer.width * 0.5f, renderBuffer.height * 0.5f); - f32 rotation = (f32)input->timeNowInS * 0.25f; + DqnV4 colorRed = DqnV4_4f(0.8f, 0, 0, 1); + DqnV2i bufferMidP = DqnV2i_2f(renderBuffer.width * 0.5f, renderBuffer.height * 0.5f); + f32 rotation = (f32)input->timeNowInS * 0.25f; - // Triangle Drawing - { - DqnV4 redTransparent = DqnV4_4f(1, 0, 0, 0.5f); - - i32 boundsOffset = 100; - DqnV3 t0[3] = {DqnV3_3i(10, 70, 0), DqnV3_3i(50, 160, 0), DqnV3_3i(70, 80, 0)}; - DqnV3 t1[3] = {DqnV3_3i(180, 50, 0), DqnV3_3i(150, 1, 0), DqnV3_3i(70, 180, 0)}; - DqnV3 t2[3] = {DqnV3_3i(180, 150, 0), DqnV3_3i(120, 160, 0), DqnV3_3i(130, 180, 0)}; - DqnV3 t3[3] = {DqnV3_3i(boundsOffset, boundsOffset, 0), - DqnV3_3i(bufferMidP.w, renderBuffer.height - boundsOffset, 0), - DqnV3_3i(renderBuffer.width - boundsOffset, boundsOffset, 0)}; - DqnV3 t4[3] = {DqnV3_3i(100, 150, 0), DqnV3_3i(200, 150, 0), DqnV3_3i(200, 250, 0)}; - DqnV3 t5[3] = {DqnV3_3i(300, 150, 0), DqnV3_3i(201, 150, 0), DqnV3_3i(200, 250, 0)}; - - DTRRenderTransform rotatingXform = DTRRender_DefaultTriangleTransform(); - rotatingXform.rotation = rotation; - - if (0) + // Triangle Drawing { - DTRDebug_BeginCycleCount("DTR_Update_RenderPrimitiveTriangles", - DTRDebugCycleCount_DTR_Update_RenderPrimitiveTriangles); + DqnV4 redTransparent = DqnV4_4f(1, 0, 0, 0.5f); - DTRRender_Triangle(renderContext, 
t0[0], t0[1], t0[2], colorRed); - DTRRender_Triangle(renderContext, t1[0], t1[1], t1[2], colorRed); - DTRRender_Triangle(renderContext, t3[0], t3[1], t3[2], colorRed, rotatingXform); - DTRRender_Triangle(renderContext, t2[0], t2[1], t2[2], colorRed); - DTRRender_Triangle(renderContext, t4[0], t4[1], t4[2], colorRed); - DTRRender_Triangle(renderContext, t5[0], t5[1], t5[2], colorRed); - DTRDebug_EndCycleCount(DTRDebugCycleCount_DTR_Update_RenderPrimitiveTriangles); - } + i32 boundsOffset = 100; + DqnV3 t0[3] = {DqnV3_3i(10, 70, 0), DqnV3_3i(50, 160, 0), DqnV3_3i(70, 80, 0)}; + DqnV3 t1[3] = {DqnV3_3i(180, 50, 0), DqnV3_3i(150, 1, 0), DqnV3_3i(70, 180, 0)}; + DqnV3 t2[3] = {DqnV3_3i(180, 150, 0), DqnV3_3i(120, 160, 0), DqnV3_3i(130, 180, 0)}; + DqnV3 t3[3] = {DqnV3_3i(boundsOffset, boundsOffset, 0), + DqnV3_3i(bufferMidP.w, renderBuffer.height - boundsOffset, 0), + DqnV3_3i(renderBuffer.width - boundsOffset, boundsOffset, 0)}; + DqnV3 t4[3] = {DqnV3_3i(100, 150, 0), DqnV3_3i(200, 150, 0), DqnV3_3i(200, 250, 0)}; + DqnV3 t5[3] = {DqnV3_3i(300, 150, 0), DqnV3_3i(201, 150, 0), DqnV3_3i(200, 250, 0)}; - if (1) - { - LOCAL_PERSIST bool runTinyRendererOnce = false; - if (1 && runTinyRendererOnce) + DTRRenderTransform rotatingXform = DTRRender_DefaultTriangleTransform(); + rotatingXform.rotation = rotation; + + if (0) { - DTRDebug_RunTinyRenderer(); - runTinyRendererOnce = false; + DTRDebug_BeginCycleCount( + "DTR_Update_RenderPrimitiveTriangles", + DTRDebugCycleCount_DTR_Update_RenderPrimitiveTriangles); + + DTRRender_Triangle(renderContext, t0[0], t0[1], t0[2], colorRed); + DTRRender_Triangle(renderContext, t1[0], t1[1], t1[2], colorRed); + DTRRender_Triangle(renderContext, t3[0], t3[1], t3[2], colorRed, rotatingXform); + DTRRender_Triangle(renderContext, t2[0], t2[1], t2[2], colorRed); + DTRRender_Triangle(renderContext, t4[0], t4[1], t4[2], colorRed); + DTRRender_Triangle(renderContext, t5[0], t5[1], t5[2], colorRed); + 
DTRDebug_EndCycleCount(DTRDebugCycleCount_DTR_Update_RenderPrimitiveTriangles); } - DTRDebug_BeginCycleCount("DTR_Update_RenderModel", - DTRDebugCycleCount_DTR_Update_RenderModel); - //////////////////////////////////////////////////////////////////////// - // Draw Loaded Model - //////////////////////////////////////////////////////////////////////// - const DqnV3 LIGHT = DqnV3_Normalise(DqnV3_3f(1, -1, 1.0f)); - const f32 MODEL_SCALE = 1; - DTRMesh *const mesh = &state->mesh; - DqnV3 modelP = DqnV3_3f(0, 0, 0); + if (1) + { + LOCAL_PERSIST bool runTinyRendererOnce = false; + if (1 && runTinyRendererOnce) + { + DTRDebug_RunTinyRenderer(); + runTinyRendererOnce = false; + } - LOCAL_PERSIST f32 modelRotation = 0; - modelRotation += (input->deltaForFrame * 20.0f); - DqnV3 axis = DqnV3_3f(0, 1, 0); + DTRDebug_BeginCycleCount("DTR_Update_RenderModel", + DTRDebugCycleCount_DTR_Update_RenderModel); + //////////////////////////////////////////////////////////////////////// + // Draw Loaded Model + //////////////////////////////////////////////////////////////////////// + const DqnV3 LIGHT = DqnV3_Normalise(DqnV3_3f(1, -1, 1.0f)); + const f32 MODEL_SCALE = 1; + DTRMesh *const mesh = &state->mesh; + DqnV3 modelP = DqnV3_3f(0, 0, 0); - DTRRenderTransform transform = DTRRender_DefaultTransform(); - transform.scale = DqnV3_1f(MODEL_SCALE); - transform.rotation = modelRotation; - transform.anchor = axis; + LOCAL_PERSIST f32 modelRotation = 0; + modelRotation += (input->deltaForFrame * 20.0f); + DqnV3 axis = DqnV3_3f(0, 1, 0); - DTRRenderLight lighting = {}; - lighting.mode = DTRRenderShadingMode_Gouraud; - lighting.vector = LIGHT; - lighting.color = DqnV4_4f(1, 1, 1, 1); + DTRRenderTransform transform = DTRRender_DefaultTransform(); + transform.scale = DqnV3_1f(MODEL_SCALE); + transform.rotation = modelRotation; + transform.anchor = axis; - DTRRender_Mesh(renderContext, input->jobQueue, mesh, lighting, modelP, transform); - 
DTRDebug_EndCycleCount(DTRDebugCycleCount_DTR_Update_RenderModel); + DTRRenderLight lighting = {}; + lighting.mode = DTRRenderShadingMode_Gouraud; + lighting.vector = LIGHT; + lighting.color = DqnV4_4f(1, 1, 1, 1); + + DTRRender_Mesh(renderContext, input->jobQueue, mesh, lighting, modelP, + transform); + DTRDebug_EndCycleCount(DTRDebugCycleCount_DTR_Update_RenderModel); + } } - } - // Rect drawing - if (0) - { - DTRRenderTransform transform = DTRRender_DefaultTransform(); - transform.rotation = rotation + 45; + // Rect drawing + if (0) + { + DTRRenderTransform transform = DTRRender_DefaultTransform(); + transform.rotation = rotation + 45; - DTRRender_Rectangle(renderContext, DqnV2_1f(300.0f), DqnV2_1f(300 + 100.0f), - DqnV4_4f(0, 1.0f, 1.0f, 1.0f), transform); - } + DTRRender_Rectangle(renderContext, DqnV2_1f(300.0f), DqnV2_1f(300 + 100.0f), + DqnV4_4f(0, 1.0f, 1.0f, 1.0f), transform); + } - // Bitmap drawing - if (0) - { - DTRRenderTransform transform = DTRRender_DefaultTransform(); - transform.scale = DqnV3_1f(2.0f); + // Bitmap drawing + if (0) + { + DTRRenderTransform transform = DTRRender_DefaultTransform(); + transform.scale = DqnV3_1f(2.0f); - LOCAL_PERSIST DqnV2 bitmapP = DqnV2_2f(500, 250); - bitmapP.x += 2.0f * sinf((f32)input->timeNowInS * 0.5f); + LOCAL_PERSIST DqnV2 bitmapP = DqnV2_2f(500, 250); + bitmapP.x += 2.0f * sinf((f32)input->timeNowInS * 0.5f); - f32 cAngle = (f32)input->timeNowInS; - DqnV4 color = DqnV4_4f(0.5f + 0.5f * sinf(cAngle), 0.5f + 0.5f * sinf(2.9f * cAngle), - 0.5f + 0.5f * cosf(10.0f * cAngle), 1.0f); - DTRRender_Bitmap(renderContext, &state->bitmap, bitmapP, transform, color); - } + f32 cAngle = (f32)input->timeNowInS; + DqnV4 color = + DqnV4_4f(0.5f + 0.5f * sinf(cAngle), 0.5f + 0.5f * sinf(2.9f * cAngle), + 0.5f + 0.5f * cosf(10.0f * cAngle), 1.0f); + DTRRender_Bitmap(renderContext, &state->bitmap, bitmapP, transform, color); + } #else - // CompAssignment(renderBuffer, input, memory); +// CompAssignment(renderBuffer, input, 
memory); #endif - DTRDebug_EndCycleCount(DTRDebugCycleCount_DTR_Update); - DTRDebug_Update(state, renderContext, input, memory); + DTRDebug_EndCycleCount(DTRDebugCycleCount_DTR_Update); + DTRDebug_Update(state, renderContext, input, memory); + } } //////////////////////////////////////////////////////////////////////////// @@ -1097,13 +1093,12 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer, //////////////////////////////////////////////////////////////////////////// if (DTR_DEBUG) { - // NOTE: We should have one temp region, that is the scoped region for the - // main loop which will remove itself when we leave scope. - DQN_ASSERT(memory->tempStack.tempRegionCount == 1); + DQN_ASSERT(input->api.QueueAllJobsComplete(input->jobQueue)); for (i32 i = 0; i < DQN_ARRAY_COUNT(memory->stacks); i++) { if (&memory->stacks[i] == &memory->tempStack) continue; DQN_ASSERT(memory->stacks[i].tempRegionCount == 0); } + DqnMemStack_ClearCurrBlock(&memory->tempStack, true); } } diff --git a/src/DTRenderer.h b/src/DTRenderer.h index 849d635..4abede4 100644 --- a/src/DTRenderer.h +++ b/src/DTRenderer.h @@ -14,8 +14,7 @@ typedef struct DTRState DTRBitmap bitmap; DTRMesh mesh; - struct PlatformLock *zDepthLock; - struct PlatformLock *blitLock; + struct PlatformLock *renderLock; } DTRState; extern PlatformFlags globalDTRPlatformFlags; diff --git a/src/DTRendererPlatform.h b/src/DTRendererPlatform.h index 348e5bf..d44f37f 100644 --- a/src/DTRendererPlatform.h +++ b/src/DTRendererPlatform.h @@ -51,6 +51,7 @@ typedef struct PlatformJob // Multithreading API typedef bool PlatformAPI_QueueAddJob (PlatformJobQueue *const queue, const PlatformJob job); typedef bool PlatformAPI_QueueTryExecuteNextJob(PlatformJobQueue *const queue); +typedef bool PlatformAPI_QueueAllJobsComplete (PlatformJobQueue *const queue); //////////////////////////////////////////////////////////////////////////////// // Platform Locks @@ -75,6 +76,7 @@ typedef struct PlatformAPI 
PlatformAPI_QueueAddJob *QueueAddJob; PlatformAPI_QueueTryExecuteNextJob *QueueTryExecuteNextJob; + PlatformAPI_QueueAllJobsComplete *QueueAllJobsComplete; PlatformAPI_LockInit *LockInit; PlatformAPI_LockAcquire *LockAcquire; diff --git a/src/DTRendererRender.cpp b/src/DTRendererRender.cpp index c5c97e9..d5bfc5d 100644 --- a/src/DTRendererRender.cpp +++ b/src/DTRendererRender.cpp @@ -630,22 +630,20 @@ FILE_SCOPE inline void SIMDSetPixel(DTRRenderContext context, const i32 x, const DTR_DEBUG_EP_TIMED_FUNCTION(); DebugSIMDAssertColorInRange(color, 0.0f, 1.0f); - u32 *const bitmapPtr = (u32 *)renderBuffer->memory; - const u32 pitchInU32 = (renderBuffer->width * renderBuffer->bytesPerPixel) / 4; - // If some alpha is involved, we need to apply gamma correction, but if the // new pixel is totally opaque or invisible then we're just flat out // overwriting/keeping the state of the pixel so we can save cycles by skipping. f32 alpha = ((f32 *)&color)[3]; - bool needGammaFix = (alpha > 0.0f || alpha < (1.0f + COLOR_EPSILON)) && (colorSpace == ColorSpace_SRGB); + bool needGammaFix = + (alpha > 0.0f || alpha < (1.0f + COLOR_EPSILON)) && (colorSpace == ColorSpace_SRGB); if (needGammaFix) color = SIMDSRGB1ToLinearSpace(color); // Format: u32 == (XX, RR, GG, BB) - context.api->LockAcquire(renderBuffer->blitLock); + u32 *const bitmapPtr = (u32 *)renderBuffer->memory; + const u32 pitchInU32 = (renderBuffer->width * renderBuffer->bytesPerPixel) / 4; + u32 srcPixel = bitmapPtr[x + (y * pitchInU32)]; - __m128 src = _mm_set_ps(0, - (f32)((srcPixel >> 0) & 0xFF), - (f32)((srcPixel >> 8) & 0xFF), + __m128 src = _mm_set_ps(0, (f32)((srcPixel >> 0) & 0xFF), (f32)((srcPixel >> 8) & 0xFF), (f32)((srcPixel >> 16) & 0xFF)); src = SIMDSRGB255ToLinearSpace1(src); @@ -665,13 +663,8 @@ FILE_SCOPE inline void SIMDSetPixel(DTRRenderContext context, const i32 x, const f32 destB = ((f32 *)&dest)[2]; u32 pixel = // ((u32)(destA) << 24 | - (u32)(destR) << 16 | - (u32)(destG) << 8 | - (u32)(destB) << 0; 
+ (u32)(destR) << 16 | (u32)(destG) << 8 | (u32)(destB) << 0; bitmapPtr[x + (y * pitchInU32)] = pixel; - context.api->LockRelease(renderBuffer->blitLock); - - DTRDebug_CounterIncrement(DTRDebugCounter_SetPixels); } // colorModulate: _mm_set_ps(a, b, g, r) ie. 0=r, 1=g, 2=b, 3=a @@ -783,9 +776,9 @@ FILE_SCOPE inline f32 GetCurrZDepth(DTRRenderContext context, i32 posX, i32 posY i32 zBufferIndex = posX + (posY * zBufferPitch); DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height)); - context.api->LockAcquire(renderBuffer->zDepthLock); + context.api->LockAcquire(renderBuffer->renderLock); f32 currZDepth = renderBuffer->zBuffer[zBufferIndex]; - context.api->LockRelease(renderBuffer->zDepthLock); + context.api->LockRelease(renderBuffer->renderLock); return currZDepth; } @@ -798,9 +791,9 @@ FILE_SCOPE inline void SetCurrZDepth(DTRRenderContext context, i32 posX, i32 pos i32 zBufferIndex = posX + (posY * zBufferPitch); DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height)); - context.api->LockAcquire(renderBuffer->zDepthLock); + context.api->LockAcquire(renderBuffer->renderLock); renderBuffer->zBuffer[zBufferIndex] = newZDepth; - context.api->LockRelease(renderBuffer->zDepthLock); + context.api->LockRelease(renderBuffer->renderLock); } #define DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(type) \ @@ -882,7 +875,7 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble); - DTRRenderBuffer *renderBuffer = context.renderBuffer; + DTRRenderBuffer *const renderBuffer = context.renderBuffer; //////////////////////////////////////////////////////////////////////////// // Convert color //////////////////////////////////////////////////////////////////////////// @@ -911,12 +904,12 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const u32 NUM_Y_PIXELS_TO_SIMD = 1; // SignedArea: _mm_set_ps(unused, p3, p2, p1) ie 0=p1, 1=p1, 2=p3, 3=unused - __m128 signedAreaPixel1; - __m128 
signedAreaPixel2; + __m128 signedAreaPixel1 = _mm_set_ps1(0); + __m128 signedAreaPixel2 = _mm_set_ps1(0); - __m128 signedAreaPixelDeltaX; - __m128 signedAreaPixelDeltaY; - __m128 invSignedAreaParallelogram_4x; + __m128 signedAreaPixelDeltaX = _mm_set_ps1(0); + __m128 signedAreaPixelDeltaY = _mm_set_ps1(0); + __m128 invSignedAreaParallelogram_4x = _mm_set_ps1(0); __m128 triangleZ = _mm_set_ps(0, p3.z, p2.z, p1.z); { @@ -1003,22 +996,11 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, ((f32 *)&barycentricZ)[1] + ((f32 *)&barycentricZ)[2]; -#if 0 - // f32 currZDepth = GetCurrZDepth(context, posX, posY); -#else - DQN_ASSERT(renderBuffer); i32 zBufferIndex = posX + (posY * zBufferPitch); - - context.api->LockAcquire(renderBuffer->zDepthLock); - f32 currZDepth = renderBuffer->zBuffer[zBufferIndex]; - context.api->LockRelease(renderBuffer->zDepthLock); -#endif - if (pixelZDepth > currZDepth) + context.api->LockAcquire(renderBuffer->renderLock); + if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) { - - context.api->LockAcquire(renderBuffer->zDepthLock); renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; - context.api->LockRelease(renderBuffer->zDepthLock); __m128 finalColor = simdColor; if (!ignoreLight) @@ -1046,6 +1028,7 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, } SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); } + context.api->LockRelease(renderBuffer->renderLock); DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel); } signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); @@ -1067,16 +1050,10 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, ((f32 *)&barycentricZ)[1] + ((f32 *)&barycentricZ)[2]; i32 zBufferIndex = posX + (posY * zBufferPitch); - - context.api->LockAcquire(renderBuffer->zDepthLock); - f32 currZDepth = renderBuffer->zBuffer[zBufferIndex]; - context.api->LockRelease(renderBuffer->zDepthLock); - - if (pixelZDepth > currZDepth) + context.api->LockAcquire(renderBuffer->renderLock); + 
if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) { - context.api->LockAcquire(renderBuffer->zDepthLock); renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; - context.api->LockRelease(renderBuffer->zDepthLock); __m128 finalColor = simdColor; if (!ignoreLight) @@ -1104,6 +1081,7 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, } SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); } + context.api->LockRelease(renderBuffer->renderLock); } signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX); } @@ -1424,7 +1402,7 @@ typedef struct RenderMeshJob DqnV4 color; } RenderMeshJob; -void MultiThreadedRenderMesh(struct PlatformJobQueue *const queue, void *const userData) +void MultiThreadedRenderMesh(PlatformJobQueue *const queue, void *const userData) { if (!queue || !userData) { @@ -1433,18 +1411,16 @@ void MultiThreadedRenderMesh(struct PlatformJobQueue *const queue, void *const u } RenderMeshJob *job = (RenderMeshJob *)userData; -#if 1 TexturedTriangleInternal(job->context, job->lighting, job->v1, job->v2, job->v3, job->uv1, job->uv2, job->uv3, job->tex, job->color); -#endif } void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, DTRMesh *const mesh, DTRRenderLight lighting, const DqnV3 pos, const DTRRenderTransform transform) { - DqnMemStack *tempStack = context.tempStack; - DTRRenderBuffer *renderBuffer = context.renderBuffer; - PlatformAPI *api = context.api; + DqnMemStack *const tempStack = context.tempStack; + DTRRenderBuffer *const renderBuffer = context.renderBuffer; + PlatformAPI *const api = context.api; if (!mesh || !renderBuffer || !tempStack || !api || !jobQueue) return; @@ -1477,7 +1453,7 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, viewPModelViewProjection = DqnMat4_Mul(viewport, modelViewProjection); } - bool RUN_MULTITHREADED = false; + bool RUN_MULTITHREADED = true; for (u32 i = 0; i < mesh->numFaces; i++) { DTRMeshFace face = mesh->faces[i]; @@ -1549,7 
+1525,8 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, DqnV2 uv2 = mesh->texUV[uv2Index].xy; DqnV2 uv3 = mesh->texUV[uv3Index].xy; - DqnV4 color = lighting.color; + DqnV4 color = lighting.color; + RenderLightInternal lightingInternal = {}; lightingInternal.mode = lighting.mode; lightingInternal.vector = lighting.vector; @@ -1558,8 +1535,7 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, lightingInternal.normals[2] = norm3; lightingInternal.numNormals = 3; - bool DEBUG_NO_TEX = false; - + bool DEBUG_NO_TEX = false; if (RUN_MULTITHREADED) { RenderMeshJob *jobData = (RenderMeshJob *)DqnMemStack_Push(tempStack, sizeof(*jobData)); @@ -1612,6 +1588,7 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, uv1, uv2, uv3, &mesh->tex, color); } } + bool DEBUG_WIREFRAME = false; if (DTR_DEBUG && DEBUG_WIREFRAME) { @@ -1622,9 +1599,11 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, } } + // NOTE(doyle): Complete remaining jobs and wait until all jobs finished + // before leaving function. 
if (RUN_MULTITHREADED) { - while (api->QueueTryExecuteNextJob(jobQueue)) + while (api->QueueTryExecuteNextJob(jobQueue) || !api->QueueAllJobsComplete(jobQueue)) ; } } @@ -1858,3 +1837,4 @@ void DTRRender_Clear(DTRRenderContext context, DqnV3 color) } } } + diff --git a/src/DTRendererRender.h b/src/DTRendererRender.h index 3d4f6fc..81d1c21 100644 --- a/src/DTRendererRender.h +++ b/src/DTRendererRender.h @@ -16,12 +16,9 @@ typedef struct DTRRenderBuffer i32 width; i32 height; i32 bytesPerPixel; - - PlatformLock *volatile zDepthLock; - PlatformLock *volatile blitLock; - - u8 *memory; // Format: XX RR GG BB, and has (width * height * bytesPerPixels) elements - f32 *zBuffer; // zBuffer has (width * height) elements + PlatformLock *renderLock; + volatile u8 *memory; // Format: XX RR GG BB, and has (width * height * bytesPerPixels) elements + volatile f32 *zBuffer; // zBuffer has (width * height) elements } DTRRenderBuffer; diff --git a/src/Win32DTRenderer.cpp b/src/Win32DTRenderer.cpp index c8a25f4..b1b087f 100644 --- a/src/Win32DTRenderer.cpp +++ b/src/Win32DTRenderer.cpp @@ -64,9 +64,11 @@ struct PlatformJobQueue // NOTE: Modified by main+worker threads LONG volatile jobToExecuteIndex; HANDLE volatile win32Semaphore; + LONG volatile numJobsToComplete; // NOTE: Modified by main thread ONLY LONG volatile jobInsertIndex; + }; bool Platform_QueueAddJob(PlatformJobQueue *const queue, const PlatformJob job) @@ -76,22 +78,12 @@ bool Platform_QueueAddJob(PlatformJobQueue *const queue, const PlatformJob job) queue->jobList[queue->jobInsertIndex] = job; - _WriteBarrier(); - _mm_sfence(); - - queue->jobInsertIndex = newJobInsertIndex; + InterlockedIncrement(&queue->numJobsToComplete); ReleaseSemaphore(queue->win32Semaphore, 1, NULL); - + queue->jobInsertIndex = newJobInsertIndex; return true; } -FILE_SCOPE void DebugWin32JobPrintNumber(PlatformJobQueue *const queue, void *const userData) -{ - i32 numberToPrint = *((i32 *)userData); - DqnWin32_OutputDebugString("Thread %d: 
Printing number: %d\n", GetCurrentThreadId(), - numberToPrint); -} - bool Platform_QueueTryExecuteNextJob(PlatformJobQueue *const queue) { LONG originalJobToExecute = queue->jobToExecuteIndex; @@ -109,6 +101,7 @@ bool Platform_QueueTryExecuteNextJob(PlatformJobQueue *const queue) { PlatformJob job = queue->jobList[index]; job.callback(queue, job.userData); + InterlockedDecrement(&queue->numJobsToComplete); } return true; @@ -117,6 +110,37 @@ bool Platform_QueueTryExecuteNextJob(PlatformJobQueue *const queue) return false; } +bool Platform_QueueAllJobsComplete(PlatformJobQueue *const queue) +{ + bool result = (queue->numJobsToComplete == 0); + return result; +} + +FILE_SCOPE u32 volatile globalDebugCounter; +FILE_SCOPE bool volatile globalDebugCounterMemoize[2048]; +FILE_SCOPE PlatformLock *globalDebugLock; +FILE_SCOPE void DebugWin32IncrementCounter(PlatformJobQueue *const queue, void *const userData) +{ + Platform_LockAcquire(globalDebugLock); + DQN_ASSERT(!globalDebugCounterMemoize[globalDebugCounter]); + globalDebugCounterMemoize[globalDebugCounter] = true; + globalDebugCounter++; + u32 number = globalDebugCounter; + Platform_LockRelease(globalDebugLock); + + DqnWin32_OutputDebugString("Thread %d: Incrementing Number: %d\n", GetCurrentThreadId(), + number); +} + +FILE_SCOPE void DebugWin32JobPrintNumber(PlatformJobQueue *const queue, void *const userData) +{ + i32 numberToPrint = *((i32 *)userData); + DqnWin32_OutputDebugString("Thread %d: Printing number: %d\n", GetCurrentThreadId(), + numberToPrint); +} + + + DWORD WINAPI Win32ThreadCallback(void *lpParameter) { PlatformJobQueue *queue = (PlatformJobQueue *)lpParameter; @@ -682,6 +706,7 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi platformAPI.QueueAddJob = Platform_QueueAddJob; platformAPI.QueueTryExecuteNextJob = Platform_QueueTryExecuteNextJob; + platformAPI.QueueAllJobsComplete = Platform_QueueAllJobsComplete; platformAPI.LockInit = Platform_LockInit; 
platformAPI.LockAcquire = Platform_LockAcquire; @@ -795,7 +820,7 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi } #if 0 - // DEBUG Create jobs + // DEBUG Create print jobs for (i32 i = 0; i < 20; i++) { PlatformJob job = {}; @@ -811,6 +836,29 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi while (Platform_QueueTryExecuteNextJob(&jobQueue)) ; #endif + +#if 1 + globalDebugLock = Platform_LockInit(&globalPlatformMemory.mainStack); + DQN_ASSERT(globalDebugLock); + for (i32 i = 0; i < DQN_ARRAY_COUNT(globalDebugCounterMemoize); i++) + { + PlatformJob job = {}; + job.callback = DebugWin32IncrementCounter; + while (!Platform_QueueAddJob(&jobQueue, job)) + { + Platform_QueueTryExecuteNextJob(&jobQueue); + } + } + + while (Platform_QueueTryExecuteNextJob(&jobQueue)) + ; + + for (i32 i = 0; i < DQN_ARRAY_COUNT(globalDebugCounterMemoize); i++) + DQN_ASSERT(globalDebugCounterMemoize[i]); + + DqnWin32_OutputDebugString("\nFinal incremented value: %d\n", globalDebugCounter); + DQN_ASSERT(globalDebugCounter == DQN_ARRAY_COUNT(globalDebugCounterMemoize)); +#endif } else { diff --git a/src/build.bat b/src/build.bat index 9b76365..c30124b 100644 --- a/src/build.bat +++ b/src/build.bat @@ -29,7 +29,6 @@ REM EHa- disable exception handling (currently it's on /EHsc since libraries n REM GR- disable c runtime type information (we don't use) REM MD use dynamic runtime library REM MT use static runtime library, so build and link it into exe -REM Od disables optimisations REM Oi enable intrinsics optimisation, let us use CPU intrinsics if there is one REM instead of generating a call to external library (i.e. CRT). REM Zi enables debug data, Z7 combines the debug files into one. 
@@ -39,26 +38,44 @@ REM wd4100 unused argument parameters REM wd4201 nonstandard extension used: nameless struct/union REM wd4189 local variable is initialised but not referenced REM wd4505 unreferenced local function not used will be removed -set CompileFlags=-EHsc -GR- -Oi -MT -Z7 -W4 -wd4100 -wd4201 -wd4189 -wd4505 -O2 -FAsc /I..\src\external\ +set CompileFlags=-EHsc -GR- -Oi -MT -Z7 -W4 -wd4100 -wd4201 -wd4189 -wd4505 -FAsc /I..\src\external\ set DLLFlags=/Fm%ProjectName% /Fo%ProjectName% /Fa%ProjectName% /Fe%ProjectName% set Win32Flags=/FmWin32DTRenderer /FeWin32DTRenderer +REM Link libraries +set LinkLibraries=user32.lib kernel32.lib gdi32.lib + +REM incremental:no, turn incremental builds off +REM opt:ref, try to remove functions from libs that are not referenced at all +set LinkFlags=-incremental:no -opt:ref -subsystem:WINDOWS -machine:x64 -nologo + +set DebugMode=0 + +if %DebugMode%==1 goto :DebugFlags +goto :ReleaseFlags + +:DebugFlags +REM Od disables optimisations +REM RTC1 runtime error checks +set CompileFlags=%CompileFlags% -Od -RTC1 +goto compile + +:ReleaseFlags +REM opt:icf, COMDAT folding for debugging release build +REM DEBUG:[FULL|NONE] enforce debugging for release build +set CompileFlags=%CompileFlags% -O2 +set LinkFlags=%LinkFlags% + +REM //////////////////////////////////////////////////////////////////////////// +REM Compile +REM //////////////////////////////////////////////////////////////////////////// +:compile REM Clean time necessary for hours <10, which produces H:MM:SS.SS where the REM first character of time is an empty space. CleanTime will pad a 0 if REM necessary. 
set CleanTime=%time: =0% set TimeStamp=%date:~10,4%%date:~7,2%%date:~4,2%_%CleanTime:~0,2%%CleanTime:~3,2%%CleanTime:~6,2% -REM Link libraries -set LinkLibraries=user32.lib kernel32.lib gdi32.lib - -REM incremental:no, turn incremental builds off -REM opt:ref, try to remove functions from libs that are not referenced at all -set LinkFlags=-incremental:no -opt:ref -subsystem:WINDOWS -machine:x64 -nologo - -REM //////////////////////////////////////////////////////////////////////////// -REM Compile -REM //////////////////////////////////////////////////////////////////////////// del *.pdb >NUL 2>NUL cl %CompileFlags% %Win32Flags% ..\src\Win32DTRenderer.cpp /link %LinkLibraries% %LinkFlags% REM cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link ..\src\external\easy\easy_profiler.lib /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags%