From e860145e77ae628dc0ead76cd61fae907e1619b3 Mon Sep 17 00:00:00 2001 From: Doyle Thai Date: Mon, 19 Jun 2017 18:22:02 +1000 Subject: [PATCH] Lock pixels for multithread more granularly level --- src/DTRenderer.cpp | 15 ++- src/DTRendererPlatform.h | 3 + src/DTRendererRender.cpp | 234 ++++++++++++++++++--------------------- src/DTRendererRender.h | 9 +- src/Win32DTRenderer.cpp | 17 ++- 5 files changed, 142 insertions(+), 136 deletions(-) diff --git a/src/DTRenderer.cpp b/src/DTRenderer.cpp index eb30646..405d8cb 100644 --- a/src/DTRenderer.cpp +++ b/src/DTRenderer.cpp @@ -966,13 +966,20 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer, renderBuffer.zBuffer = (f32 *)DqnMemStack_Push( &memory->tempStack, zBufferSize * sizeof(*renderBuffer.zBuffer)); - for (u32 i = 0; i < zBufferSize; i++) - renderBuffer.zBuffer[i] = DQN_F32_MIN; + renderBuffer.pixelLockTable = (bool *)DqnMemStack_Push( + &memory->tempStack, zBufferSize * sizeof(*renderBuffer.pixelLockTable)); + + for (u32 i = 0; i < zBufferSize; i++) + { + renderBuffer.zBuffer[i] = DQN_F32_MIN; + renderBuffer.pixelLockTable[i] = false; + } DTRRenderContext renderContext = {}; renderContext.renderBuffer = &renderBuffer; renderContext.tempStack = &memory->tempStack; renderContext.api = &input->api; + renderContext.jobQueue = input->jobQueue; //////////////////////////////////////////////////////////////////////////// // Update and Render //////////////////////////////////////////////////////////////////////////// @@ -1086,6 +1093,10 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer, DTRDebug_EndCycleCount(DTRDebugCycleCount_DTR_Update); DTRDebug_Update(state, renderContext, input, memory); } + + while (input->api.QueueTryExecuteNextJob(input->jobQueue) || + !input->api.QueueAllJobsComplete(input->jobQueue)) + ; } //////////////////////////////////////////////////////////////////////////// diff --git a/src/DTRendererPlatform.h b/src/DTRendererPlatform.h index d44f37f..0a59005 100644 --- a/src/DTRendererPlatform.h +++ b/src/DTRendererPlatform.h @@ -53,6 +53,8 @@ typedef bool PlatformAPI_QueueAddJob (PlatformJobQueue *const queue, c typedef bool PlatformAPI_QueueTryExecuteNextJob(PlatformJobQueue *const queue); typedef bool PlatformAPI_QueueAllJobsComplete (PlatformJobQueue *const queue); +typedef u32 PlatformAPI_AtomicCompareSwap(u32 *volatile dest, u32 swapVal, u32 compareVal); + //////////////////////////////////////////////////////////////////////////////// // Platform Locks //////////////////////////////////////////////////////////////////////////////// @@ -77,6 +79,7 @@ typedef struct PlatformAPI PlatformAPI_QueueAddJob *QueueAddJob; PlatformAPI_QueueTryExecuteNextJob *QueueTryExecuteNextJob; PlatformAPI_QueueAllJobsComplete *QueueAllJobsComplete; + PlatformAPI_AtomicCompareSwap *AtomicCompareSwap; PlatformAPI_LockInit *LockInit; PlatformAPI_LockAcquire *LockAcquire; diff --git a/src/DTRendererRender.cpp b/src/DTRendererRender.cpp index d5bfc5d..02a3628 100644 --- a/src/DTRendererRender.cpp +++ b/src/DTRendererRender.cpp @@ -643,8 +643,9 @@ FILE_SCOPE inline void SIMDSetPixel(DTRRenderContext context, const i32 x, const const u32 pitchInU32 = (renderBuffer->width * renderBuffer->bytesPerPixel) / 4; u32 srcPixel = bitmapPtr[x + (y * pitchInU32)]; - __m128 src = _mm_set_ps(0, (f32)((srcPixel >> 0) & 0xFF), (f32)((srcPixel >> 8) & 0xFF), - (f32)((srcPixel >> 16) & 0xFF)); + __m128 src = _mm_set_ps(0, (f32)((srcPixel >> 0) & 0xFF), + (f32)((srcPixel >> 8) & 0xFF), + (f32)((srcPixel >> 16) & 0xFF)); src = SIMDSRGB255ToLinearSpace1(src); f32 invA = 1 - alpha; @@ -663,7 +664,9 @@ FILE_SCOPE inline void SIMDSetPixel(DTRRenderContext context, const i32 x, const f32 destB = ((f32 *)&dest)[2]; u32 pixel = // ((u32)(destA) << 24 | - (u32)(destR) << 16 | (u32)(destG) << 8 | (u32)(destB) << 0; + (u32)(destR) << 16 | + (u32)(destG) << 8 | + (u32)(destB) << 0; bitmapPtr[x + (y * pitchInU32)] = pixel; } @@ -814,21 +817,18 @@ FILE_SCOPE inline void SetCurrZDepth(DTRRenderContext context, i32 posX, i32 pos DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMD##type); \ } while (0) -FILE_SCOPE void SIMDRasteriseTrianglePixel(DTRRenderContext context, - const DTRBitmap *const texture, const i32 posX, - const i32 posY, const i32 maxX, const DqnV2 uv1, - const DqnV2 uv2SubUv1, const DqnV2 uv3SubUv1, - const __m128 simdColor, const __m128 triangleZ, - const __m128 signedArea, - const __m128 invSignedAreaParallelogram_4x) +FILE_SCOPE void +SIMDRasteriseTrianglePixel(DTRRenderContext context, const DTRBitmap *const texture, const i32 posX, + const i32 posY, const i32 maxX, const DqnV2 uv1, const DqnV2 uv2SubUv1, + const DqnV2 uv3SubUv1, const __m128 simdColor, const __m128 triangleZ, + const __m128 signedArea, const __m128 invSignedAreaParallelogram_4x, + const f32 preserveAlpha, const bool ignoreLight, const __m128 p1Light, + const __m128 p2Light, const __m128 p3Light) { - const __m128 ZERO_4X = _mm_set_ps1(0.0f); - const u32 IS_GREATER_MASK = 0xF; - - DTRRenderBuffer *renderBuffer = context.renderBuffer; - - // TODO(doyle): Copy lighting work over. But not important since using this - // function causes performance problems. + DTRRenderBuffer *const renderBuffer = context.renderBuffer; + const __m128 ZERO_4X = _mm_set_ps1(0.0f); + const u32 IS_GREATER_MASK = 0xF; + const u32 zBufferPitch = renderBuffer->width; // Rasterise buffer(X, Y) pixel { @@ -838,36 +838,56 @@ FILE_SCOPE void SIMDRasteriseTrianglePixel(DTRRenderContext context, if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK && posX < maxX) { DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_RasterisePixel); - __m128 barycentric = _mm_mul_ps(signedArea, invSignedAreaParallelogram_4x); - __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); - - f32 pixelZDepth = - ((f32 *)&barycentricZ)[0] + ((f32 *)&barycentricZ)[1] + ((f32 *)&barycentricZ)[2]; - f32 currZDepth = GetCurrZDepth(context, posX, posY); - if (pixelZDepth > currZDepth) { - SetCurrZDepth(context, posX, posY, pixelZDepth); + __m128 barycentric = _mm_mul_ps(signedArea, invSignedAreaParallelogram_4x); + __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); - __m128 finalColor = simdColor; - if (texture) + f32 pixelZDepth = ((f32 *)&barycentricZ)[0] + ((f32 *)&barycentricZ)[1] + + ((f32 *)&barycentricZ)[2]; + + i32 zBufferIndex = posX + (posY * zBufferPitch); + if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) { - __m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1, - uv3SubUv1, barycentric); - finalColor = _mm_mul_ps(texSampledColor, simdColor); + renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; + __m128 finalColor = simdColor; + if (!ignoreLight) + { + __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); + __m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]); + __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); + + __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); + __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); + __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); + + __m128 light = _mm_add_ps(barycentricLight3, + _mm_add_ps(barycentricLight1, barycentricLight2)); + + finalColor = _mm_mul_ps(finalColor, light); + ((f32 *)&finalColor)[3] = preserveAlpha; + } + + if (texture) + { + __m128 texSampledColor = SIMDSampleTextureForTriangle( + texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); + finalColor = _mm_mul_ps(texSampledColor, finalColor); + } + + SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); } - SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); } DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel); } } } -FILE_SCOPE void SIMDTriangle(DTRRenderContext context, - const DqnV3 p1, const DqnV3 p2, const DqnV3 p3, const DqnV2 uv1, - const DqnV2 uv2, const DqnV2 uv3, const f32 lightIntensity1, - const f32 lightIntensity2, const f32 lightIntensity3, - const bool ignoreLight, DTRBitmap *const texture, DqnV4 color, - const DqnV2i min, const DqnV2i max) +FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const DqnV3 p2, + const DqnV3 p3, const DqnV2 uv1, const DqnV2 uv2, const DqnV2 uv3, + const f32 lightIntensity1, const f32 lightIntensity2, + const f32 lightIntensity3, const bool ignoreLight, + DTRBitmap *const texture, DqnV4 color, const DqnV2i min, + const DqnV2i max) { DTR_DEBUG_EP_TIMED_FUNCTION(); @@ -900,12 +920,12 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, //////////////////////////////////////////////////////////////////////////// // Setup SIMD data //////////////////////////////////////////////////////////////////////////// - const u32 NUM_X_PIXELS_TO_SIMD = 2; + const u32 NUM_X_PIXELS_TO_SIMD = 1; const u32 NUM_Y_PIXELS_TO_SIMD = 1; // SignedArea: _mm_set_ps(unused, p3, p2, p1) ie 0=p1, 1=p1, 2=p3, 3=unused __m128 signedAreaPixel1 = _mm_set_ps1(0); - __m128 signedAreaPixel2 = _mm_set_ps1(0); + // __m128 signedAreaPixel2 = _mm_set_ps1(0); __m128 signedAreaPixelDeltaX = _mm_set_ps1(0); __m128 signedAreaPixelDeltaY = _mm_set_ps1(0); @@ -942,7 +962,7 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, signedAreaPixelDeltaY = _mm_set_ps(0, signedArea3DeltaY, signedArea2DeltaY, signedArea1DeltaY); signedAreaPixel1 = _mm_set_ps(0, signedArea3Start, signedArea2Start, signedArea1Start); - signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX); + // signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX); // NOTE: Increase step size to the number of pixels rasterised with SIMD { @@ -958,10 +978,10 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV2 uv2SubUv1 = uv2 - uv1; const DqnV2 uv3SubUv1 = uv3 - uv1; -#define UNROLL_LOOP 1 +#define INLINE_RASTERISE 1 DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble); -#if UNROLL_LOOP +#if INLINE_RASTERISE const u32 IS_GREATER_MASK = 0xF; const u32 zBufferPitch = renderBuffer->width; #endif @@ -973,11 +993,11 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD) { __m128 signedArea1 = signedAreaPixel1; - __m128 signedArea2 = signedAreaPixel2; + // __m128 signedArea2 = signedAreaPixel2; for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD) { -#if UNROLL_LOOP +#if INLINE_RASTERISE // Rasterise buffer(X, Y) pixel { __m128 checkArea = signedArea1; @@ -997,108 +1017,64 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, ((f32 *)&barycentricZ)[2]; i32 zBufferIndex = posX + (posY * zBufferPitch); - context.api->LockAcquire(renderBuffer->renderLock); + __m128 finalColor = simdColor; + if (!ignoreLight) + { + __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); + __m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]); + __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); + + __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); + __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); + __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); + + __m128 light = _mm_add_ps(barycentricLight3, + _mm_add_ps(barycentricLight1, barycentricLight2)); + + finalColor = _mm_mul_ps(finalColor, light); + ((f32 *)&finalColor)[3] = preserveAlpha; + } + + if (texture) + { + __m128 texSampledColor = SIMDSampleTextureForTriangle( + texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); + finalColor = _mm_mul_ps(texSampledColor, finalColor); + } + +#if 1 + bool currLockValue; + do + { + currLockValue = (bool)context.api->AtomicCompareSwap( + (u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32)true, (u32)false); + } while (currLockValue != false); +#endif + if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) { renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; - - __m128 finalColor = simdColor; - if (!ignoreLight) - { - __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); - __m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]); - __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); - - __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); - __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); - __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); - - __m128 light = - _mm_add_ps(barycentricLight3, - _mm_add_ps(barycentricLight1, barycentricLight2)); - - finalColor = _mm_mul_ps(finalColor, light); - ((f32 *)&finalColor)[3] = preserveAlpha; - } - - if (texture) - { - __m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); - finalColor = _mm_mul_ps(texSampledColor, finalColor); - } SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); } - context.api->LockRelease(renderBuffer->renderLock); + renderBuffer->pixelLockTable[zBufferIndex] = false; DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel); } signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); } - // Rasterise buffer(X + 1, Y) pixel - { - __m128 checkArea = signedArea2; - __m128 isGreater = _mm_cmpge_ps(checkArea, ZERO_4X); - i32 isGreaterResult = _mm_movemask_ps(isGreater); - i32 posX = bufferX + 1; - i32 posY = bufferY; - if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK && posX < max.x) - { - __m128 barycentric = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x); - __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); - - f32 pixelZDepth = ((f32 *)&barycentricZ)[0] + - ((f32 *)&barycentricZ)[1] + - ((f32 *)&barycentricZ)[2]; - i32 zBufferIndex = posX + (posY * zBufferPitch); - context.api->LockAcquire(renderBuffer->renderLock); - if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) - { - renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; - - __m128 finalColor = simdColor; - if (!ignoreLight) - { - __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); - __m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]); - __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); - - __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); - __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); - __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); - - __m128 light = - _mm_add_ps(barycentricLight3, - _mm_add_ps(barycentricLight1, barycentricLight2)); - - finalColor = _mm_mul_ps(finalColor, light); - ((f32 *)&finalColor)[3] = preserveAlpha; - } - - if (texture) - { - __m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); - finalColor = _mm_mul_ps(texSampledColor, finalColor); - } - SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); - } - context.api->LockRelease(renderBuffer->renderLock); - } - signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX); - } #else - SIMDRasteriseTrianglePixel(renderBuffer, texture, bufferX, bufferY, max.x, uv1, uv2SubUv1, + SIMDRasteriseTrianglePixel(context, texture, bufferX, bufferY, max.x, uv1, uv2SubUv1, uv3SubUv1, simdColor, triangleZ, signedArea1, - invSignedAreaParallelogram_4x); - SIMDRasteriseTrianglePixel(renderBuffer, texture, bufferX + 1, bufferY, max.x, uv1, uv2SubUv1, - uv3SubUv1, simdColor, triangleZ, signedArea2, - invSignedAreaParallelogram_4x); + invSignedAreaParallelogram_4x, preserveAlpha, ignoreLight, + p1Light, p2Light, p3Light); signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); - signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX); + // signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX); #endif } signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY); - signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY); + // signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY); } + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Rasterise); DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle); } diff --git a/src/DTRendererRender.h b/src/DTRendererRender.h index 81d1c21..cd7fc12 100644 --- a/src/DTRendererRender.h +++ b/src/DTRendererRender.h @@ -20,6 +20,8 @@ typedef struct DTRRenderBuffer volatile u8 *memory; // Format: XX RR GG BB, and has (width * height * bytesPerPixels) elements volatile f32 *zBuffer; // zBuffer has (width * height) elements + volatile bool *pixelLockTable; // has (width * height) elements + } DTRRenderBuffer; // Using transforms for 2D ignores the 'z' element. @@ -76,9 +78,10 @@ typedef struct DTRRenderLight typedef struct DTRRenderContext { - DTRRenderBuffer *renderBuffer; - DqnMemStack *tempStack; - PlatformAPI *api; + DTRRenderBuffer *renderBuffer; + DqnMemStack *tempStack; + PlatformAPI *api; + PlatformJobQueue *jobQueue; } DTRRenderContext; // NOTE: All colors should be in the range of [0->1] where DqnV4 is a struct with 4 floats, rgba diff --git a/src/Win32DTRenderer.cpp b/src/Win32DTRenderer.cpp index b1b087f..3bc5fb6 100644 --- a/src/Win32DTRenderer.cpp +++ b/src/Win32DTRenderer.cpp @@ -8,6 +8,18 @@ #define UNICODE #define _UNICODE +//////////////////////////////////////////////////////////////////////////////// +// Platform Atomics +//////////////////////////////////////////////////////////////////////////////// +u32 Platform_AtomicCompareSwap(u32 *volatile dest, u32 swapVal, u32 compareVal) +{ + // TODO(doyle): Compile time assert + DQN_ASSERT(sizeof(LONG) == sizeof(u32)); + u32 result = + (u32)InterlockedCompareExchange((LONG volatile *)dest, (LONG)swapVal, (LONG)compareVal); + return result; +} + //////////////////////////////////////////////////////////////////////////////// // Platform Mutex/Lock //////////////////////////////////////////////////////////////////////////////// @@ -708,6 +720,8 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi platformAPI.QueueTryExecuteNextJob = Platform_QueueTryExecuteNextJob; platformAPI.QueueAllJobsComplete = Platform_QueueAllJobsComplete; + platformAPI.AtomicCompareSwap = Platform_AtomicCompareSwap; + platformAPI.LockInit = Platform_LockInit; platformAPI.LockAcquire = Platform_LockAcquire; platformAPI.LockRelease = Platform_LockRelease; @@ -903,8 +917,7 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi if (dllCode.DTR_Update) { - dllCode.DTR_Update(&platformBuffer, &platformInput, - &globalPlatformMemory); + dllCode.DTR_Update(&platformBuffer, &platformInput, &globalPlatformMemory); } }