From 5397cdd9b961e1e41ae26c4a6f84e8becee79f56 Mon Sep 17 00:00:00 2001 From: Doyle Thai Date: Tue, 20 Jun 2017 02:13:03 +1000 Subject: [PATCH] Better assert(), move some Win32 functions to DQN Asserts are now more defensive and diagnostic. Initially asserts provide a hard break in the program, and the intention is to disable asserts for release. Having experienced bugs in release mode due to optimisations with asserts off there's little protection against errors that occur in release mode since all asserts are off. Now asserts can be used in release mode whilst still evaluating the expression, and allowing user messages/diagnostics to console. --- src/DTRenderer.cpp | 1 + src/DTRendererRender.cpp | 296 +++++++++++++++++++++++++++++++-------- src/DTRendererRender.h | 2 + src/Win32DTRenderer.cpp | 144 +++++-------------- src/build.bat | 1 + src/dqn.h | 161 ++++++++++++++++++++- 6 files changed, 431 insertions(+), 174 deletions(-) diff --git a/src/DTRenderer.cpp b/src/DTRenderer.cpp index 405d8cb..c1cc157 100644 --- a/src/DTRenderer.cpp +++ b/src/DTRenderer.cpp @@ -976,6 +976,7 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer, } DTRRenderContext renderContext = {}; + renderContext.multithread = true; renderContext.renderBuffer = &renderBuffer; renderContext.tempStack = &memory->tempStack; renderContext.api = &input->api; diff --git a/src/DTRendererRender.cpp b/src/DTRendererRender.cpp index 02a3628..4a3e072 100644 --- a/src/DTRendererRender.cpp +++ b/src/DTRendererRender.cpp @@ -978,13 +978,9 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn const DqnV2 uv2SubUv1 = uv2 - uv1; const DqnV2 uv3SubUv1 = uv3 - uv1; -#define INLINE_RASTERISE 1 DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble); - -#if INLINE_RASTERISE const u32 IS_GREATER_MASK = 0xF; const u32 zBufferPitch = renderBuffer->width; -#endif //////////////////////////////////////////////////////////////////////////// // Scan and Render @@ 
-997,7 +993,6 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD) { -#if INLINE_RASTERISE // Rasterise buffer(X, Y) pixel { __m128 checkArea = signedArea1; @@ -1017,43 +1012,46 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn ((f32 *)&barycentricZ)[2]; i32 zBufferIndex = posX + (posY * zBufferPitch); - __m128 finalColor = simdColor; - if (!ignoreLight) + if (context.multithread) { - __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); - __m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]); - __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); - - __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); - __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); - __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); - - __m128 light = _mm_add_ps(barycentricLight3, - _mm_add_ps(barycentricLight1, barycentricLight2)); - - finalColor = _mm_mul_ps(finalColor, light); - ((f32 *)&finalColor)[3] = preserveAlpha; + bool currLockValue; + do + { + currLockValue = (bool)context.api->AtomicCompareSwap( + (u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32) true, + (u32) false); + } while (currLockValue != false); } - if (texture) - { - __m128 texSampledColor = SIMDSampleTextureForTriangle( - texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); - finalColor = _mm_mul_ps(texSampledColor, finalColor); - } - -#if 1 - bool currLockValue; - do - { - currLockValue = (bool)context.api->AtomicCompareSwap( - (u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32)true, (u32)false); - } while (currLockValue != false); -#endif - if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) { + __m128 finalColor = simdColor; renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; + if (!ignoreLight) + { + __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); + __m128 barycentricB_4x = 
_mm_set_ps1(((f32 *)&barycentric)[1]); + __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); + + __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); + __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); + __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); + + __m128 light = + _mm_add_ps(barycentricLight3, + _mm_add_ps(barycentricLight1, barycentricLight2)); + + finalColor = _mm_mul_ps(finalColor, light); + ((f32 *)&finalColor)[3] = preserveAlpha; + } + + if (texture) + { + __m128 texSampledColor = SIMDSampleTextureForTriangle( + texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); + finalColor = _mm_mul_ps(texSampledColor, finalColor); + } + SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); } renderBuffer->pixelLockTable[zBufferIndex] = false; @@ -1061,15 +1059,192 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn } signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); } + } + signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY); + // signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY); + } -#else - SIMDRasteriseTrianglePixel(context, texture, bufferX, bufferY, max.x, uv1, uv2SubUv1, - uv3SubUv1, simdColor, triangleZ, signedArea1, - invSignedAreaParallelogram_4x, preserveAlpha, ignoreLight, - p1Light, p2Light, p3Light); - signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); - // signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX); -#endif + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Rasterise); + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle); +} + +FILE_SCOPE void SIMDBetterTriangle(DTRRenderContext context, const DqnV3 p1, const DqnV3 p2, + const DqnV3 p3, const DqnV2 uv1, const DqnV2 uv2, + const DqnV2 uv3, const f32 lightIntensity1, + const f32 lightIntensity2, const f32 lightIntensity3, + const bool ignoreLight, DTRBitmap *const texture, DqnV4 color, + const DqnV2i min, const DqnV2i max) + +{ + 
DTR_DEBUG_EP_TIMED_FUNCTION(); + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle); + + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble); + + DTRRenderBuffer *const renderBuffer = context.renderBuffer; + //////////////////////////////////////////////////////////////////////////// + // Convert color + //////////////////////////////////////////////////////////////////////////// + __m128 simdColor = _mm_set_ps(color.a, color.b, color.g, color.r); + simdColor = SIMDSRGB1ToLinearSpace(simdColor); + simdColor = SIMDPreMultiplyAlpha1(simdColor); + f32 preserveAlpha = ((f32 *)&simdColor)[3]; + + const __m128 ZERO_4X = _mm_set_ps1(0.0f); + __m128 simdLightIntensity1 = _mm_set_ps1(lightIntensity1); + __m128 simdLightIntensity2 = _mm_set_ps1(lightIntensity2); + __m128 simdLightIntensity3 = _mm_set_ps1(lightIntensity3); + + simdLightIntensity1 = _mm_max_ps(simdLightIntensity1, ZERO_4X); + simdLightIntensity2 = _mm_max_ps(simdLightIntensity2, ZERO_4X); + simdLightIntensity3 = _mm_max_ps(simdLightIntensity3, ZERO_4X); + + __m128 p1Light = _mm_mul_ps(simdColor, simdLightIntensity1); + __m128 p2Light = _mm_mul_ps(simdColor, simdLightIntensity2); + __m128 p3Light = _mm_mul_ps(simdColor, simdLightIntensity3); + + //////////////////////////////////////////////////////////////////////////// + // Setup SIMD data + //////////////////////////////////////////////////////////////////////////// + const u32 NUM_X_PIXELS_TO_SIMD = 1; + const u32 NUM_Y_PIXELS_TO_SIMD = 1; + + // SignedArea: _mm_set_ps(unused, p3, p2, p1) ie 0=p1, 1=p2, 2=p3, 3=unused + __m128 signedAreaPixel1 = _mm_set_ps1(0); + // __m128 signedAreaPixel2 = _mm_set_ps1(0); + + __m128 signedAreaPixelDeltaX = _mm_set_ps1(0); + __m128 signedAreaPixelDeltaY = _mm_set_ps1(0); + __m128 invSignedAreaParallelogram_4x = _mm_set_ps1(0); + + __m128 triangleZ = _mm_set_ps(0, p3.z, p2.z, p1.z); + { + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble_SArea); + DTR_DEBUG_EP_TIMED_BLOCK("SIMDTriangle_Preamble_SArea"); + 
DqnV2 startP = DqnV2_V2i(min); + f32 signedArea1Start = Triangle2TimesSignedArea(p2.xy, p3.xy, startP); + f32 signedArea1DeltaX = p2.y - p3.y; + f32 signedArea1DeltaY = p3.x - p2.x; + + f32 signedArea2Start = Triangle2TimesSignedArea(p3.xy, p1.xy, startP); + f32 signedArea2DeltaX = p3.y - p1.y; + f32 signedArea2DeltaY = p1.x - p3.x; + + f32 signedArea3Start = Triangle2TimesSignedArea(p1.xy, p2.xy, startP); + f32 signedArea3DeltaX = p1.y - p2.y; + f32 signedArea3DeltaY = p2.x - p1.x; + DTR_DEBUG_EP_TIMED_END_BLOCK(); + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble_SArea); + + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble_SIMDStep); + f32 signedAreaParallelogram = signedArea1Start + signedArea2Start + signedArea3Start; + if (signedAreaParallelogram == 0) return; + + f32 invSignedAreaParallelogram = 1.0f / signedAreaParallelogram; + invSignedAreaParallelogram_4x = _mm_set_ps1(invSignedAreaParallelogram); + + // NOTE: Order is important here! + signedAreaPixelDeltaX = _mm_set_ps(0, signedArea3DeltaX, signedArea2DeltaX, signedArea1DeltaX); + signedAreaPixelDeltaY = _mm_set_ps(0, signedArea3DeltaY, signedArea2DeltaY, signedArea1DeltaY); + + signedAreaPixel1 = _mm_set_ps(0, signedArea3Start, signedArea2Start, signedArea1Start); + // signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX); + + // NOTE: Increase step size to the number of pixels rasterised with SIMD + { + const __m128 STEP_X_4X = _mm_set_ps1((f32)NUM_X_PIXELS_TO_SIMD); + const __m128 STEP_Y_4X = _mm_set_ps1((f32)NUM_Y_PIXELS_TO_SIMD); + + signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, STEP_X_4X); + signedAreaPixelDeltaY = _mm_mul_ps(signedAreaPixelDeltaY, STEP_Y_4X); + } + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble_SIMDStep); + } + + const DqnV2 uv2SubUv1 = uv2 - uv1; + const DqnV2 uv3SubUv1 = uv3 - uv1; + + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble); + const u32 IS_GREATER_MASK = 0xF; + const u32 zBufferPitch = 
renderBuffer->width; + + //////////////////////////////////////////////////////////////////////////// + // Scan and Render + //////////////////////////////////////////////////////////////////////////// + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Rasterise); + for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD) + { + __m128 signedArea1 = signedAreaPixel1; + // __m128 signedArea2 = signedAreaPixel2; + + for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD) + { + // Rasterise buffer(X, Y) pixel + { + __m128 checkArea = signedArea1; + __m128 isGreater = _mm_cmpge_ps(checkArea, ZERO_4X); + i32 isGreaterResult = _mm_movemask_ps(isGreater); + i32 posX = bufferX; + i32 posY = bufferY; + + if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK) + { + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_RasterisePixel); + __m128 barycentric = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x); + __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); + + f32 pixelZDepth = ((f32 *)&barycentricZ)[0] + + ((f32 *)&barycentricZ)[1] + + ((f32 *)&barycentricZ)[2]; + + i32 zBufferIndex = posX + (posY * zBufferPitch); + if (context.multithread) + { + bool currLockValue; + do + { + currLockValue = (bool)context.api->AtomicCompareSwap( + (u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32) true, + (u32) false); + } while (currLockValue != false); + } + + if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) + { + __m128 finalColor = simdColor; + renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; + if (!ignoreLight) + { + __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); + __m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]); + __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); + + __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); + __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); + __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); + + 
__m128 light = + _mm_add_ps(barycentricLight3, + _mm_add_ps(barycentricLight1, barycentricLight2)); + + finalColor = _mm_mul_ps(finalColor, light); + ((f32 *)&finalColor)[3] = preserveAlpha; + } + + if (texture) + { + __m128 texSampledColor = SIMDSampleTextureForTriangle( + texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); + finalColor = _mm_mul_ps(texSampledColor, finalColor); + } + + SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); + } + renderBuffer->pixelLockTable[zBufferIndex] = false; + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel); + } + signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); + } } signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY); // signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY); @@ -1169,13 +1344,24 @@ FILE_SCOPE void SlowTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn f32 barycentricB = signedArea2 * invSignedAreaParallelogram; f32 barycentricC = signedArea3 * invSignedAreaParallelogram; - f32 pixelZDepth = p1.z + (barycentricB * (p2SubP1.z)) + (barycentricC * (p3SubP1.z)); - f32 currZDepth = GetCurrZDepth(context, bufferX, bufferY); - - if (pixelZDepth > currZDepth) + i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); + if (context.multithread) { - SetCurrZDepth(context, bufferX, bufferY, pixelZDepth); - DqnV4 finalColor = color; + bool currLockValue; + do + { + currLockValue = (bool)context.api->AtomicCompareSwap( + (u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32) true, + (u32) false); + } while (currLockValue != false); + } + + f32 pixelZDepth = + p1.z + (barycentricB * (p2SubP1.z)) + (barycentricC * (p3SubP1.z)); + if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) + { + DqnV4 finalColor = color; + renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; if (!ignoreLight) { @@ -1218,6 +1404,7 @@ FILE_SCOPE void SlowTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn SetPixel(context, bufferX, bufferY, finalColor, 
ColorSpace_Linear); } + renderBuffer->pixelLockTable[zBufferIndex] = false; DEBUG_SLOW_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel); } @@ -1429,7 +1616,6 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, viewPModelViewProjection = DqnMat4_Mul(viewport, modelViewProjection); } - bool RUN_MULTITHREADED = true; for (u32 i = 0; i < mesh->numFaces; i++) { DTRMeshFace face = mesh->faces[i]; @@ -1512,7 +1698,7 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, lightingInternal.numNormals = 3; bool DEBUG_NO_TEX = false; - if (RUN_MULTITHREADED) + if (context.multithread) { RenderMeshJob *jobData = (RenderMeshJob *)DqnMemStack_Push(tempStack, sizeof(*jobData)); if (jobData) @@ -1575,10 +1761,10 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, } } - // NOTE(doyle): Complete remaining jobs and wait until all jobs finished - // before leaving function. - if (RUN_MULTITHREADED) + if (context.multithread) { + // NOTE(doyle): Complete remaining jobs and wait until all jobs finished + // before leaving function. 
while (api->QueueTryExecuteNextJob(jobQueue) || !api->QueueAllJobsComplete(jobQueue)) ; } diff --git a/src/DTRendererRender.h b/src/DTRendererRender.h index cd7fc12..b0f6831 100644 --- a/src/DTRendererRender.h +++ b/src/DTRendererRender.h @@ -82,6 +82,8 @@ typedef struct DTRRenderContext DqnMemStack *tempStack; PlatformAPI *api; PlatformJobQueue *jobQueue; + + bool multithread; } DTRRenderContext; // NOTE: All colors should be in the range of [0->1] where DqnV4 is a struct with 4 floats, rgba diff --git a/src/Win32DTRenderer.cpp b/src/Win32DTRenderer.cpp index 3bc5fb6..3415d8c 100644 --- a/src/Win32DTRenderer.cpp +++ b/src/Win32DTRenderer.cpp @@ -570,55 +570,25 @@ FILE_SCOPE void Win32ProcessMessages(HWND window, PlatformInput *input) } } -// Return the index of the last slash -i32 Win32GetModuleDirectory(char *const buf, const u32 bufLen) -{ - if (!buf || bufLen == 0) return 0; - u32 copiedLen = GetModuleFileName(NULL, buf, bufLen); - if (copiedLen == bufLen) - { - DQN_WIN32_ERROR_BOX( - "GetModuleFileName() buffer maxed: Len of copied text is len " - "of supplied buffer.", - NULL); - DQN_ASSERT(DQN_INVALID_CODE_PATH); - } - - // NOTE: Should always work if GetModuleFileName works and we're running an - // executable. 
- i32 lastSlashIndex = 0; - for (i32 i = copiedLen; i > 0; i--) - { - if (buf[i] == '\\') - { - lastSlashIndex = i; - break; - } - } - - return lastSlashIndex; -} - int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLine, int nShowCmd) { //////////////////////////////////////////////////////////////////////////// // Initialise Win32 Window //////////////////////////////////////////////////////////////////////////// - WNDCLASSEXW wc = - { - sizeof(WNDCLASSEX), - CS_HREDRAW | CS_VREDRAW | CS_OWNDC, - Win32MainProcCallback, - 0, // int cbClsExtra - 0, // int cbWndExtra - hInstance, - LoadIcon(NULL, IDI_APPLICATION), - LoadCursor(NULL, IDC_ARROW), - GetSysColorBrush(COLOR_3DFACE), - L"", // LPCTSTR lpszMenuName - L"DRendererClass", - NULL, // HICON hIconSm + WNDCLASSEXW wc = { + sizeof(WNDCLASSEX), + CS_HREDRAW | CS_VREDRAW | CS_OWNDC, + Win32MainProcCallback, + 0, // int cbClsExtra + 0, // int cbWndExtra + hInstance, + LoadIcon(NULL, IDI_APPLICATION), + LoadCursor(NULL, IDC_ARROW), + GetSysColorBrush(COLOR_3DFACE), + L"", // LPCTSTR lpszMenuName + L"DRendererClass", + NULL, // HICON hIconSm }; if (!RegisterClassExW(&wc)) @@ -665,7 +635,7 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi globalRenderBitmap.width = header.biWidth; globalRenderBitmap.height = header.biHeight; globalRenderBitmap.bytesPerPixel = header.biBitCount / 8; - DQN_ASSERT(globalRenderBitmap.bytesPerPixel >= 1); + if (!DQN_ASSERT(globalRenderBitmap.bytesPerPixel >= 1)) return -1; HDC deviceContext = GetDC(mainWindow); globalRenderBitmap.handle = CreateDIBSection( @@ -688,17 +658,22 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi char dllTmpPath[MAX_PATH] = {}; { char exeDir[MAX_PATH] = {}; - i32 lastSlashIndex = - Win32GetModuleDirectory(exeDir, DQN_ARRAY_COUNT(exeDir)); - DQN_ASSERT(lastSlashIndex + 1 < DQN_ARRAY_COUNT(exeDir)); + i32 lastSlashIndex = DqnWin32_GetEXEDirectory(exeDir, 
DQN_ARRAY_COUNT(exeDir)); + if (lastSlashIndex != -1) + { + DQN_ASSERT(lastSlashIndex + 1 < DQN_ARRAY_COUNT(exeDir)); - exeDir[lastSlashIndex + 1] = 0; - u32 numCopied = Dqn_sprintf(dllPath, "%s%s", exeDir, DLL_NAME); - DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(dllPath)); + exeDir[lastSlashIndex + 1] = 0; + u32 numCopied = Dqn_sprintf(dllPath, "%s%s", exeDir, DLL_NAME); + DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(dllPath)); - numCopied = - Dqn_sprintf(dllTmpPath, "%s%s", exeDir, DLL_TMP_NAME); - DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(dllTmpPath)); + numCopied = Dqn_sprintf(dllTmpPath, "%s%s", exeDir, DLL_TMP_NAME); + DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(dllTmpPath)); + } + else + { + DQN_ASSERT(DQN_INVALID_CODE_PATH); + } } //////////////////////////////////////////////////////////////////////////// @@ -748,62 +723,8 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi //////////////////////////////////////////////////////////////////////// // Query CPU Cores //////////////////////////////////////////////////////////////////////// - i32 numCores = 0; - i32 numLogicalCores = 0; - - SYSTEM_INFO systemInfo; - GetNativeSystemInfo(&systemInfo); - DqnWin32_OutputDebugString("Number of Logical Processors: %d\n", - systemInfo.dwNumberOfProcessors); - numLogicalCores = systemInfo.dwNumberOfProcessors; - - DWORD logicalProcInfoRequiredSize = 0; - u8 insufficientBuffer = {}; - GetLogicalProcessorInformationEx( - RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)insufficientBuffer, - &logicalProcInfoRequiredSize); - - u8 *rawProcInfoArray = - (u8 *)DqnMemStack_Push(&globalPlatformMemory.tempStack, logicalProcInfoRequiredSize); - - if (rawProcInfoArray) - { - if (GetLogicalProcessorInformationEx( - RelationProcessorCore, - (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)rawProcInfoArray, - &logicalProcInfoRequiredSize)) - { - SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *logicalProcInfo = - (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX 
*)rawProcInfoArray; - DWORD bytesRead = 0; - - do - { - // NOTE: High efficiency value has greater performance and less efficiency. - PROCESSOR_RELATIONSHIP *procInfo = &logicalProcInfo->Processor; - u32 efficiency = procInfo->EfficiencyClass; - DqnWin32_OutputDebugString("Core %d: Efficiency: %d\n", numCores++, efficiency); - - DQN_ASSERT(logicalProcInfo->Relationship == RelationProcessorCore); - DQN_ASSERT(procInfo->GroupCount == 1); - - bytesRead += logicalProcInfo->Size; - logicalProcInfo = - (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((u8 *)logicalProcInfo + - logicalProcInfo->Size); - - } while (bytesRead < logicalProcInfoRequiredSize); - } - else - { - DqnWin32_DisplayLastError("GetLogicalProcessorInformationEx() failed"); - } - } - else - { - DQN_WIN32_ERROR_BOX("DqnMemStack_Push() failed", NULL); - } - DqnMemStackTempRegion_End(memRegion); + i32 numCores, numThreadsPerCore; + DqnWin32_GetNumThreadsAndCores(&numCores, &numThreadsPerCore); //////////////////////////////////////////////////////////////////////// // Threading @@ -815,11 +736,10 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi DQN_ASSERT(((size_t)&jobQueue.jobToExecuteIndex) % 4 == 0); // NOTE: (numCores - 1), 1 core is already exclusively for main thread - i32 availableThreads = (numCores - 1) * numLogicalCores; - + i32 availableThreads = (numCores - 1) * numThreadsPerCore; // TODO(doyle): Logic for single core/thread processors DQN_ASSERT(availableThreads > 0); - + jobQueue.win32Semaphore = CreateSemaphore(NULL, 0, availableThreads, NULL); if (jobQueue.win32Semaphore) { diff --git a/src/build.bat b/src/build.bat index c30124b..69b8174 100644 --- a/src/build.bat +++ b/src/build.bat @@ -78,6 +78,7 @@ set TimeStamp=%date:~10,4%%date:~7,2%%date:~4,2%_%CleanTime:~0,2%%CleanTime:~3,2 del *.pdb >NUL 2>NUL cl %CompileFlags% %Win32Flags% ..\src\Win32DTRenderer.cpp /link %LinkLibraries% %LinkFlags% +REM cl /P ..\src\Win32DTRenderer.cpp REM cl %CompileFlags% %DLLFlags% 
..\src\UnityBuild\UnityBuild.cpp /LD /link ..\src\external\easy\easy_profiler.lib /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags% cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags% diff --git a/src/dqn.h b/src/dqn.h index 4a70e7f..b0e3bec 100644 --- a/src/dqn.h +++ b/src/dqn.h @@ -52,7 +52,6 @@ typedef float f32; #define DQN_INVALID_CODE_PATH 0 #define DQN_ARRAY_COUNT(array) (sizeof(array) / sizeof(array[0])) -#define DQN_ASSERT(expr) if (!(expr)) { (*((i32 *)0)) = 0; } #define DQN_PI 3.14159265359f #define DQN_SQUARED(x) ((x) * (x)) @@ -63,6 +62,18 @@ typedef float f32; #define DQN_MAX(a, b) ((a) < (b) ? (b) : (a)) #define DQN_MIN(a, b) ((a) < (b) ? (a) : (b)) #define DQN_SWAP(type, a, b) do { type tmp = a; a = b; b = tmp; } while(0) + +//////////////////////////////////////////////////////////////////////////////// +// Dqn Error +//////////////////////////////////////////////////////////////////////////////// +#define DQN_ASSERT_HARD(expr) if (!(expr)) { *((int *)0) = 0; } + +#define DQN_ASSERT(expr) DqnAssertInternal(expr, __FILE__, __LINE__, #expr, NULL) +#define DQN_ASSERT_MSG(expr, msg) DqnAssertInternal(expr, __FILE__, __LINE__, #expr, msg) +DQN_FILE_SCOPE bool DqnAssertInternal(const bool result, const char *const file, const i32 lineNum, + const char *const expr, const char *const msg); + + //////////////////////////////////////////////////////////////////////////////// // DqnMem - Memory //////////////////////////////////////////////////////////////////////////////// @@ -445,6 +456,29 @@ bool DqnArray_RemoveStable(DqnArray *array, u64 index) } #endif // DQN_CPP_MODE +//////////////////////////////////////////////////////////////////////////////// +// DqnJobQueue - Multithreaded Job Queue +//////////////////////////////////////////////////////////////////////////////// +typedef void DqnJob_Callback(struct DqnJobQueue *const queue, void *const userData); 
+typedef struct DqnJob +{ + DqnJob_Callback *callback; + void *userData; +} DqnJob; + +typedef struct DqnJobQueue +{ + DqnJob *volatile jobList; + u32 size; + + // NOTE: Modified by main+worker threads + u32 volatile jobToExecuteIndex; + void *win32Semaphore; + u32 volatile numJobsToComplete; + + // NOTE: Modified by main thread ONLY + u32 volatile jobInsertIndex; +} DqnJobQueue; //////////////////////////////////////////////////////////////////////////////// // Math //////////////////////////////////////////////////////////////////////////////// @@ -817,14 +851,20 @@ DQN_FILE_SCOPE i32 DqnRnd_PCGRange(DqnRandPCGState *pcg, i32 min, i32 max); DQN_FILE_SCOPE bool DqnWin32_UTF8ToWChar (const char *const in, wchar_t *const out, const i32 outLen); DQN_FILE_SCOPE bool DqnWin32_WCharToUTF8 (const wchar_t *const in, char *const out, const i32 outLen); -DQN_FILE_SCOPE void DqnWin32_GetClientDim (const HWND window, LONG *width, LONG *height); -DQN_FILE_SCOPE void DqnWin32_GetRectDim (RECT rect, LONG *width, LONG *height); -DQN_FILE_SCOPE void DqnWin32_DisplayLastError(const char *const errorPrefix); -DQN_FILE_SCOPE void DqnWin32_DisplayErrorCode(const DWORD error, const char *const errorPrefix); +DQN_FILE_SCOPE void DqnWin32_GetClientDim (const HWND window, LONG *width, LONG *height); +DQN_FILE_SCOPE void DqnWin32_GetRectDim (RECT rect, LONG *width, LONG *height); +DQN_FILE_SCOPE void DqnWin32_DisplayLastError (const char *const errorPrefix); +DQN_FILE_SCOPE void DqnWin32_DisplayErrorCode (const DWORD error, const char *const errorPrefix); DQN_FILE_SCOPE void DqnWin32_OutputDebugString(const char *const formatStr, ...); -#endif /* DQN_WIN32_IMPLEMENTATION */ +// buf: Filled with the path to the executable file. +// Returns the offset to the last backslash, -1 if bufLen was not large enough. Returns 0 if buf is null, bufLen is 0, or no backslash is found. +DQN_FILE_SCOPE i32 DqnWin32_GetEXEDirectory(char *const buf, const u32 bufLen); +// numCores: numThreadsPerCore: Can be NULL, the function will just skip it. 
+// Uses calloc and free for querying numCores. +DQN_FILE_SCOPE void DqnWin32_GetNumThreadsAndCores(i32 *const numCores, i32 *const numThreadsPerCore); +#endif /* DQN_WIN32_IMPLEMENTATION */ #ifndef DQN_INI_H #define DQN_INI_H @@ -1360,6 +1400,34 @@ STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char comma, char peri // NOTE: DQN_INI_IMPLEMENTATION modified to be included when DQN_IMPLEMENTATION defined // #define DQN_INI_IMPLEMENTATION #define DQN_INI_STRLEN(s) Dqn_strlen(s) + +//////////////////////////////////////////////////////////////////////////////// +// Dqn Error +//////////////////////////////////////////////////////////////////////////////// +#if (defined(_WIN32) || defined(_WIN64)) && defined(DQN_WIN32_IMPLEMENTATION) +#else +#include <stdio.h> +#endif + +DQN_FILE_SCOPE bool DqnAssertInternal(const bool result, const char *const file, const i32 lineNum, + const char *const expr, const char *const msg) +{ + if (!result) + { + const char *const formatStrNoMsg = "DqnAssert() failed: %s|%d| (%s)\n"; + const char *const formatStrWithMsg = "DqnAssert() failed: %s|%d| (%s): %s\n"; + const char *const formatStr = (msg) ? formatStrWithMsg : formatStrNoMsg; +#if (defined(_WIN32) || defined(_WIN64)) && defined(DQN_WIN32_IMPLEMENTATION) + DqnWin32_OutputDebugString(formatStr, file, lineNum, expr, msg); +#else + printf(formatStr, file, lineNum, expr, msg); +#endif + + (*((i32 *)0)) = 0; + } + return result; +} + //////////////////////////////////////////////////////////////////////////////// // DqnMemory - Default Memory Routines //////////////////////////////////////////////////////////////////////////////// @@ -3328,12 +3396,91 @@ DQN_FILE_SCOPE void DqnWin32_OutputDebugString(const char *const formatStr, ...) 
va_start(argList, formatStr); { i32 numCopied = Dqn_vsprintf(str, formatStr, argList); - DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(str)); + DQN_ASSERT_HARD(numCopied < DQN_ARRAY_COUNT(str)); } va_end(argList); OutputDebugString(str); } + +DQN_FILE_SCOPE i32 DqnWin32_GetEXEDirectory(char *const buf, const u32 bufLen) +{ + if (!buf || bufLen == 0) return 0; + u32 copiedLen = GetModuleFileName(NULL, buf, bufLen); + if (copiedLen == bufLen) return -1; + + // NOTE: Should always work if GetModuleFileName works and we're running an + // executable. + i32 lastSlashIndex = 0; + for (i32 i = copiedLen; i > 0; i--) + { + if (buf[i] == '\\') + { + lastSlashIndex = i; + break; + } + } + + return lastSlashIndex; +} + +DQN_FILE_SCOPE void DqnWin32_GetNumThreadsAndCores(i32 *const numCores, i32 *const numThreadsPerCore) +{ + if (numThreadsPerCore) + { + SYSTEM_INFO systemInfo; + GetNativeSystemInfo(&systemInfo); + *numThreadsPerCore = systemInfo.dwNumberOfProcessors; + } + + if (numCores) + { + *numCores = 0; + DWORD requiredSize = 0; + u8 insufficientBuffer = {0}; + GetLogicalProcessorInformationEx( + RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)insufficientBuffer, + &requiredSize); + + u8 *rawProcInfoArray = (u8 *)DqnMem_Calloc(requiredSize); + if (!rawProcInfoArray) + { + DQN_WIN32_ERROR_BOX("DqnMem_Calloc() failed", NULL); + DQN_ASSERT(DQN_INVALID_CODE_PATH); + return; + } + + if (GetLogicalProcessorInformationEx( + RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)rawProcInfoArray, + &requiredSize)) + { + SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *logicalProcInfo = + (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)rawProcInfoArray; + DWORD bytesRead = 0; + + do + { + // NOTE: High efficiency value has greater performance and less efficiency. 
+ PROCESSOR_RELATIONSHIP *procInfo = &logicalProcInfo->Processor; + u32 efficiency = procInfo->EfficiencyClass; + (*numCores)++; + DQN_ASSERT(logicalProcInfo->Relationship == RelationProcessorCore); + DQN_ASSERT(procInfo->GroupCount == 1); + + bytesRead += logicalProcInfo->Size; + logicalProcInfo = + (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((u8 *)logicalProcInfo + + logicalProcInfo->Size); + } while (bytesRead < requiredSize); + } + else + { + DqnWin32_DisplayLastError("GetLogicalProcessorInformationEx() failed"); + } + + DqnMem_Free(rawProcInfoArray); + } +} #endif // DQN_WIN32_PLATFROM FILE_SCOPE bool DqnFile_OpenInternal(const wchar_t *const path,