diff --git a/src/DTRenderer.cpp b/src/DTRenderer.cpp index 405d8cb..c1cc157 100644 --- a/src/DTRenderer.cpp +++ b/src/DTRenderer.cpp @@ -976,6 +976,7 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer, } DTRRenderContext renderContext = {}; + renderContext.multithread = true; renderContext.renderBuffer = &renderBuffer; renderContext.tempStack = &memory->tempStack; renderContext.api = &input->api; diff --git a/src/DTRendererRender.cpp b/src/DTRendererRender.cpp index 02a3628..4a3e072 100644 --- a/src/DTRendererRender.cpp +++ b/src/DTRendererRender.cpp @@ -978,13 +978,9 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn const DqnV2 uv2SubUv1 = uv2 - uv1; const DqnV2 uv3SubUv1 = uv3 - uv1; -#define INLINE_RASTERISE 1 DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble); - -#if INLINE_RASTERISE const u32 IS_GREATER_MASK = 0xF; const u32 zBufferPitch = renderBuffer->width; -#endif //////////////////////////////////////////////////////////////////////////// // Scan and Render @@ -997,7 +993,6 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD) { -#if INLINE_RASTERISE // Rasterise buffer(X, Y) pixel { __m128 checkArea = signedArea1; @@ -1017,43 +1012,46 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn ((f32 *)&barycentricZ)[2]; i32 zBufferIndex = posX + (posY * zBufferPitch); - __m128 finalColor = simdColor; - if (!ignoreLight) + if (context.multithread) { - __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); - __m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]); - __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); - - __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); - __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); - __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); - - __m128 light = _mm_add_ps(barycentricLight3, - _mm_add_ps(barycentricLight1, barycentricLight2)); - - finalColor = _mm_mul_ps(finalColor, light); - ((f32 *)&finalColor)[3] = preserveAlpha; + bool currLockValue; + do + { + currLockValue = (bool)context.api->AtomicCompareSwap( + (u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32) true, + (u32) false); + } while (currLockValue != false); } - if (texture) - { - __m128 texSampledColor = SIMDSampleTextureForTriangle( - texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); - finalColor = _mm_mul_ps(texSampledColor, finalColor); - } - -#if 1 - bool currLockValue; - do - { - currLockValue = (bool)context.api->AtomicCompareSwap( - (u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32)true, (u32)false); - } while (currLockValue != false); -#endif - if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) { + __m128 finalColor = simdColor; renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; + if (!ignoreLight) + { + __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); + __m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]); + __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); + + __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); + __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); + __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); + + __m128 light = + _mm_add_ps(barycentricLight3, + _mm_add_ps(barycentricLight1, barycentricLight2)); + + finalColor = _mm_mul_ps(finalColor, light); + ((f32 *)&finalColor)[3] = preserveAlpha; + } + + if (texture) + { + __m128 texSampledColor = SIMDSampleTextureForTriangle( + texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); + finalColor = _mm_mul_ps(texSampledColor, finalColor); + } + SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); } renderBuffer->pixelLockTable[zBufferIndex] = false; @@ -1061,15 +1059,192 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn } signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); } + } + signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY); + // signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY); + } -#else - SIMDRasteriseTrianglePixel(context, texture, bufferX, bufferY, max.x, uv1, uv2SubUv1, - uv3SubUv1, simdColor, triangleZ, signedArea1, - invSignedAreaParallelogram_4x, preserveAlpha, ignoreLight, - p1Light, p2Light, p3Light); - signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); - // signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX); -#endif + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Rasterise); + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle); +} + +FILE_SCOPE void SIMDBetterTriangle(DTRRenderContext context, const DqnV3 p1, const DqnV3 p2, + const DqnV3 p3, const DqnV2 uv1, const DqnV2 uv2, + const DqnV2 uv3, const f32 lightIntensity1, + const f32 lightIntensity2, const f32 lightIntensity3, + const bool ignoreLight, DTRBitmap *const texture, DqnV4 color, + const DqnV2i min, const DqnV2i max) + +{ + DTR_DEBUG_EP_TIMED_FUNCTION(); + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle); + + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble); + + DTRRenderBuffer *const renderBuffer = context.renderBuffer; + //////////////////////////////////////////////////////////////////////////// + // Convert color + //////////////////////////////////////////////////////////////////////////// + __m128 simdColor = _mm_set_ps(color.a, color.b, color.g, color.r); + simdColor = SIMDSRGB1ToLinearSpace(simdColor); + simdColor = SIMDPreMultiplyAlpha1(simdColor); + f32 preserveAlpha = ((f32 *)&simdColor)[3]; + + const __m128 ZERO_4X = _mm_set_ps1(0.0f); + __m128 simdLightIntensity1 = _mm_set_ps1(lightIntensity1); + __m128 simdLightIntensity2 = _mm_set_ps1(lightIntensity2); + __m128 simdLightIntensity3 = _mm_set_ps1(lightIntensity3); + + simdLightIntensity1 = _mm_max_ps(simdLightIntensity1, ZERO_4X); + simdLightIntensity2 = _mm_max_ps(simdLightIntensity2, ZERO_4X); + simdLightIntensity3 = _mm_max_ps(simdLightIntensity3, ZERO_4X); + + __m128 p1Light = _mm_mul_ps(simdColor, simdLightIntensity1); + __m128 p2Light = _mm_mul_ps(simdColor, simdLightIntensity2); + __m128 p3Light = _mm_mul_ps(simdColor, simdLightIntensity3); + + //////////////////////////////////////////////////////////////////////////// + // Setup SIMD data + //////////////////////////////////////////////////////////////////////////// + const u32 NUM_X_PIXELS_TO_SIMD = 1; + const u32 NUM_Y_PIXELS_TO_SIMD = 1; + + // SignedArea: _mm_set_ps(unused, p3, p2, p1) ie 0=p1, 1=p1, 2=p3, 3=unused + __m128 signedAreaPixel1 = _mm_set_ps1(0); + // __m128 signedAreaPixel2 = _mm_set_ps1(0); + + __m128 signedAreaPixelDeltaX = _mm_set_ps1(0); + __m128 signedAreaPixelDeltaY = _mm_set_ps1(0); + __m128 invSignedAreaParallelogram_4x = _mm_set_ps1(0); + + __m128 triangleZ = _mm_set_ps(0, p3.z, p2.z, p1.z); + { + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble_SArea); + DTR_DEBUG_EP_TIMED_BLOCK("SIMDTriangle_Preamble_SArea"); + DqnV2 startP = DqnV2_V2i(min); + f32 signedArea1Start = Triangle2TimesSignedArea(p2.xy, p3.xy, startP); + f32 signedArea1DeltaX = p2.y - p3.y; + f32 signedArea1DeltaY = p3.x - p2.x; + + f32 signedArea2Start = Triangle2TimesSignedArea(p3.xy, p1.xy, startP); + f32 signedArea2DeltaX = p3.y - p1.y; + f32 signedArea2DeltaY = p1.x - p3.x; + + f32 signedArea3Start = Triangle2TimesSignedArea(p1.xy, p2.xy, startP); + f32 signedArea3DeltaX = p1.y - p2.y; + f32 signedArea3DeltaY = p2.x - p1.x; + DTR_DEBUG_EP_TIMED_END_BLOCK(); + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble_SArea); + + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Preamble_SIMDStep); + f32 signedAreaParallelogram = signedArea1Start + signedArea2Start + signedArea3Start; + if (signedAreaParallelogram == 0) return; + + f32 invSignedAreaParallelogram = 1.0f / signedAreaParallelogram; + invSignedAreaParallelogram_4x = _mm_set_ps1(invSignedAreaParallelogram); + + // NOTE: Order is important here! + signedAreaPixelDeltaX = _mm_set_ps(0, signedArea3DeltaX, signedArea2DeltaX, signedArea1DeltaX); + signedAreaPixelDeltaY = _mm_set_ps(0, signedArea3DeltaY, signedArea2DeltaY, signedArea1DeltaY); + + signedAreaPixel1 = _mm_set_ps(0, signedArea3Start, signedArea2Start, signedArea1Start); + // signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX); + + // NOTE: Increase step size to the number of pixels rasterised with SIMD + { + const __m128 STEP_X_4X = _mm_set_ps1((f32)NUM_X_PIXELS_TO_SIMD); + const __m128 STEP_Y_4X = _mm_set_ps1((f32)NUM_Y_PIXELS_TO_SIMD); + + signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, STEP_X_4X); + signedAreaPixelDeltaY = _mm_mul_ps(signedAreaPixelDeltaY, STEP_Y_4X); + } + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble_SIMDStep); + } + + const DqnV2 uv2SubUv1 = uv2 - uv1; + const DqnV2 uv3SubUv1 = uv3 - uv1; + + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble); + const u32 IS_GREATER_MASK = 0xF; + const u32 zBufferPitch = renderBuffer->width; + + //////////////////////////////////////////////////////////////////////////// + // Scan and Render + //////////////////////////////////////////////////////////////////////////// + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_Rasterise); + for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD) + { + __m128 signedArea1 = signedAreaPixel1; + // __m128 signedArea2 = signedAreaPixel2; + + for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD) + { + // Rasterise buffer(X, Y) pixel + { + __m128 checkArea = signedArea1; + __m128 isGreater = _mm_cmpge_ps(checkArea, ZERO_4X); + i32 isGreaterResult = _mm_movemask_ps(isGreater); + i32 posX = bufferX; + i32 posY = bufferY; + + if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK) + { + DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_RasterisePixel); + __m128 barycentric = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x); + __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric); + + f32 pixelZDepth = ((f32 *)&barycentricZ)[0] + + ((f32 *)&barycentricZ)[1] + + ((f32 *)&barycentricZ)[2]; + + i32 zBufferIndex = posX + (posY * zBufferPitch); + if (context.multithread) + { + bool currLockValue; + do + { + currLockValue = (bool)context.api->AtomicCompareSwap( + (u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32) true, + (u32) false); + } while (currLockValue != false); + } + + if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) + { + __m128 finalColor = simdColor; + renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; + if (!ignoreLight) + { + __m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]); + __m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]); + __m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]); + + __m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x); + __m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x); + __m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x); + + __m128 light = + _mm_add_ps(barycentricLight3, + _mm_add_ps(barycentricLight1, barycentricLight2)); + + finalColor = _mm_mul_ps(finalColor, light); + ((f32 *)&finalColor)[3] = preserveAlpha; + } + + if (texture) + { + __m128 texSampledColor = SIMDSampleTextureForTriangle( + texture, uv1, uv2SubUv1, uv3SubUv1, barycentric); + finalColor = _mm_mul_ps(texSampledColor, finalColor); + } + + SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear); + } + renderBuffer->pixelLockTable[zBufferIndex] = false; + DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel); + } + signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX); + } } signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY); // signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY); @@ -1169,13 +1344,24 @@ FILE_SCOPE void SlowTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn f32 barycentricB = signedArea2 * invSignedAreaParallelogram; f32 barycentricC = signedArea3 * invSignedAreaParallelogram; - f32 pixelZDepth = p1.z + (barycentricB * (p2SubP1.z)) + (barycentricC * (p3SubP1.z)); - f32 currZDepth = GetCurrZDepth(context, bufferX, bufferY); - - if (pixelZDepth > currZDepth) + i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); + if (context.multithread) { - SetCurrZDepth(context, bufferX, bufferY, pixelZDepth); - DqnV4 finalColor = color; + bool currLockValue; + do + { + currLockValue = (bool)context.api->AtomicCompareSwap( + (u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32) true, + (u32) false); + } while (currLockValue != false); + } + + f32 pixelZDepth = + p1.z + (barycentricB * (p2SubP1.z)) + (barycentricC * (p3SubP1.z)); + if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex]) + { + DqnV4 finalColor = color; + renderBuffer->zBuffer[zBufferIndex] = pixelZDepth; if (!ignoreLight) { @@ -1218,6 +1404,7 @@ FILE_SCOPE void SlowTriangle(DTRRenderContext context, const DqnV3 p1, const Dqn SetPixel(context, bufferX, bufferY, finalColor, ColorSpace_Linear); } + renderBuffer->pixelLockTable[zBufferIndex] = false; DEBUG_SLOW_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel); } @@ -1429,7 +1616,6 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, viewPModelViewProjection = DqnMat4_Mul(viewport, modelViewProjection); } - bool RUN_MULTITHREADED = true; for (u32 i = 0; i < mesh->numFaces; i++) { DTRMeshFace face = mesh->faces[i]; @@ -1512,7 +1698,7 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, lightingInternal.numNormals = 3; bool DEBUG_NO_TEX = false; - if (RUN_MULTITHREADED) + if (context.multithread) { RenderMeshJob *jobData = (RenderMeshJob *)DqnMemStack_Push(tempStack, sizeof(*jobData)); if (jobData) @@ -1575,10 +1761,10 @@ void DTRRender_Mesh(DTRRenderContext context, PlatformJobQueue *const jobQueue, } } - // NOTE(doyle): Complete remaining jobs and wait until all jobs finished - // before leaving function. - if (RUN_MULTITHREADED) + if (context.multithread) { + // NOTE(doyle): Complete remaining jobs and wait until all jobs finished + // before leaving function. while (api->QueueTryExecuteNextJob(jobQueue) || !api->QueueAllJobsComplete(jobQueue)) ; } diff --git a/src/DTRendererRender.h b/src/DTRendererRender.h index cd7fc12..b0f6831 100644 --- a/src/DTRendererRender.h +++ b/src/DTRendererRender.h @@ -82,6 +82,8 @@ typedef struct DTRRenderContext DqnMemStack *tempStack; PlatformAPI *api; PlatformJobQueue *jobQueue; + + bool multithread; } DTRRenderContext; // NOTE: All colors should be in the range of [0->1] where DqnV4 is a struct with 4 floats, rgba diff --git a/src/Win32DTRenderer.cpp b/src/Win32DTRenderer.cpp index 3bc5fb6..3415d8c 100644 --- a/src/Win32DTRenderer.cpp +++ b/src/Win32DTRenderer.cpp @@ -570,55 +570,25 @@ FILE_SCOPE void Win32ProcessMessages(HWND window, PlatformInput *input) } } -// Return the index of the last slash -i32 Win32GetModuleDirectory(char *const buf, const u32 bufLen) -{ - if (!buf || bufLen == 0) return 0; - u32 copiedLen = GetModuleFileName(NULL, buf, bufLen); - if (copiedLen == bufLen) - { - DQN_WIN32_ERROR_BOX( - "GetModuleFileName() buffer maxed: Len of copied text is len " - "of supplied buffer.", - NULL); - DQN_ASSERT(DQN_INVALID_CODE_PATH); - } - - // NOTE: Should always work if GetModuleFileName works and we're running an - // executable. - i32 lastSlashIndex = 0; - for (i32 i = copiedLen; i > 0; i--) - { - if (buf[i] == '\\') - { - lastSlashIndex = i; - break; - } - } - - return lastSlashIndex; -} - int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLine, int nShowCmd) { //////////////////////////////////////////////////////////////////////////// // Initialise Win32 Window //////////////////////////////////////////////////////////////////////////// - WNDCLASSEXW wc = - { - sizeof(WNDCLASSEX), - CS_HREDRAW | CS_VREDRAW | CS_OWNDC, - Win32MainProcCallback, - 0, // int cbClsExtra - 0, // int cbWndExtra - hInstance, - LoadIcon(NULL, IDI_APPLICATION), - LoadCursor(NULL, IDC_ARROW), - GetSysColorBrush(COLOR_3DFACE), - L"", // LPCTSTR lpszMenuName - L"DRendererClass", - NULL, // HICON hIconSm + WNDCLASSEXW wc = { + sizeof(WNDCLASSEX), + CS_HREDRAW | CS_VREDRAW | CS_OWNDC, + Win32MainProcCallback, + 0, // int cbClsExtra + 0, // int cbWndExtra + hInstance, + LoadIcon(NULL, IDI_APPLICATION), + LoadCursor(NULL, IDC_ARROW), + GetSysColorBrush(COLOR_3DFACE), + L"", // LPCTSTR lpszMenuName + L"DRendererClass", + NULL, // HICON hIconSm }; if (!RegisterClassExW(&wc)) @@ -665,7 +635,7 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi globalRenderBitmap.width = header.biWidth; globalRenderBitmap.height = header.biHeight; globalRenderBitmap.bytesPerPixel = header.biBitCount / 8; - DQN_ASSERT(globalRenderBitmap.bytesPerPixel >= 1); + if (!DQN_ASSERT(globalRenderBitmap.bytesPerPixel >= 1)) return -1; HDC deviceContext = GetDC(mainWindow); globalRenderBitmap.handle = CreateDIBSection( @@ -688,17 +658,22 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi char dllTmpPath[MAX_PATH] = {}; { char exeDir[MAX_PATH] = {}; - i32 lastSlashIndex = - Win32GetModuleDirectory(exeDir, DQN_ARRAY_COUNT(exeDir)); - DQN_ASSERT(lastSlashIndex + 1 < DQN_ARRAY_COUNT(exeDir)); + i32 lastSlashIndex = DqnWin32_GetEXEDirectory(exeDir, DQN_ARRAY_COUNT(exeDir)); + if (lastSlashIndex != -1) + { + DQN_ASSERT(lastSlashIndex + 1 < DQN_ARRAY_COUNT(exeDir)); - exeDir[lastSlashIndex + 1] = 0; - u32 numCopied = Dqn_sprintf(dllPath, "%s%s", exeDir, DLL_NAME); - DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(dllPath)); + exeDir[lastSlashIndex + 1] = 0; + u32 numCopied = Dqn_sprintf(dllPath, "%s%s", exeDir, DLL_NAME); + DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(dllPath)); - numCopied = - Dqn_sprintf(dllTmpPath, "%s%s", exeDir, DLL_TMP_NAME); - DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(dllTmpPath)); + numCopied = Dqn_sprintf(dllTmpPath, "%s%s", exeDir, DLL_TMP_NAME); + DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(dllTmpPath)); + } + else + { + DQN_ASSERT(DQN_INVALID_CODE_PATH); + } } //////////////////////////////////////////////////////////////////////////// @@ -748,62 +723,8 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi //////////////////////////////////////////////////////////////////////// // Query CPU Cores //////////////////////////////////////////////////////////////////////// - i32 numCores = 0; - i32 numLogicalCores = 0; - - SYSTEM_INFO systemInfo; - GetNativeSystemInfo(&systemInfo); - DqnWin32_OutputDebugString("Number of Logical Processors: %d\n", - systemInfo.dwNumberOfProcessors); - numLogicalCores = systemInfo.dwNumberOfProcessors; - - DWORD logicalProcInfoRequiredSize = 0; - u8 insufficientBuffer = {}; - GetLogicalProcessorInformationEx( - RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)insufficientBuffer, - &logicalProcInfoRequiredSize); - - u8 *rawProcInfoArray = - (u8 *)DqnMemStack_Push(&globalPlatformMemory.tempStack, logicalProcInfoRequiredSize); - - if (rawProcInfoArray) - { - if (GetLogicalProcessorInformationEx( - RelationProcessorCore, - (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)rawProcInfoArray, - &logicalProcInfoRequiredSize)) - { - SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *logicalProcInfo = - (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)rawProcInfoArray; - DWORD bytesRead = 0; - - do - { - // NOTE: High efficiency value has greater performance and less efficiency. - PROCESSOR_RELATIONSHIP *procInfo = &logicalProcInfo->Processor; - u32 efficiency = procInfo->EfficiencyClass; - DqnWin32_OutputDebugString("Core %d: Efficiency: %d\n", numCores++, efficiency); - - DQN_ASSERT(logicalProcInfo->Relationship == RelationProcessorCore); - DQN_ASSERT(procInfo->GroupCount == 1); - - bytesRead += logicalProcInfo->Size; - logicalProcInfo = - (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((u8 *)logicalProcInfo + - logicalProcInfo->Size); - - } while (bytesRead < logicalProcInfoRequiredSize); - } - else - { - DqnWin32_DisplayLastError("GetLogicalProcessorInformationEx() failed"); - } - } - else - { - DQN_WIN32_ERROR_BOX("DqnMemStack_Push() failed", NULL); - } - DqnMemStackTempRegion_End(memRegion); + i32 numCores, numThreadsPerCore; + DqnWin32_GetNumThreadsAndCores(&numCores, &numThreadsPerCore); //////////////////////////////////////////////////////////////////////// // Threading @@ -815,11 +736,10 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi DQN_ASSERT(((size_t)&jobQueue.jobToExecuteIndex) % 4 == 0); // NOTE: (numCores - 1), 1 core is already exclusively for main thread - i32 availableThreads = (numCores - 1) * numLogicalCores; - + i32 availableThreads = (numCores - 1) * numThreadsPerCore; // TODO(doyle): Logic for single core/thread processors DQN_ASSERT(availableThreads > 0); - + jobQueue.win32Semaphore = CreateSemaphore(NULL, 0, availableThreads, NULL); if (jobQueue.win32Semaphore) { diff --git a/src/build.bat b/src/build.bat index c30124b..69b8174 100644 --- a/src/build.bat +++ b/src/build.bat @@ -78,6 +78,7 @@ set TimeStamp=%date:~10,4%%date:~7,2%%date:~4,2%_%CleanTime:~0,2%%CleanTime:~3,2 del *.pdb >NUL 2>NUL cl %CompileFlags% %Win32Flags% ..\src\Win32DTRenderer.cpp /link %LinkLibraries% %LinkFlags% +REM cl /P ..\src\Win32DTRenderer.cpp REM cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link ..\src\external\easy\easy_profiler.lib /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags% cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags% diff --git a/src/dqn.h b/src/dqn.h index 4a70e7f..b0e3bec 100644 --- a/src/dqn.h +++ b/src/dqn.h @@ -52,7 +52,6 @@ typedef float f32; #define DQN_INVALID_CODE_PATH 0 #define DQN_ARRAY_COUNT(array) (sizeof(array) / sizeof(array[0])) -#define DQN_ASSERT(expr) if (!(expr)) { (*((i32 *)0)) = 0; } #define DQN_PI 3.14159265359f #define DQN_SQUARED(x) ((x) * (x)) @@ -63,6 +62,18 @@ typedef float f32; #define DQN_MAX(a, b) ((a) < (b) ? (b) : (a)) #define DQN_MIN(a, b) ((a) < (b) ? (a) : (b)) #define DQN_SWAP(type, a, b) do { type tmp = a; a = b; b = tmp; } while(0) + +//////////////////////////////////////////////////////////////////////////////// +// Dqn Error +//////////////////////////////////////////////////////////////////////////////// +#define DQN_ASSERT_HARD(expr) if (!(expr)) { *((int *)0) = 0; } + +#define DQN_ASSERT(expr) DqnAssertInternal(expr, __FILE__, __LINE__, #expr, NULL) +#define DQN_ASSERT_MSG(expr, msg) DqnAssertInternal(expr, __FILE__, __LINE__, #expr, msg) +DQN_FILE_SCOPE bool DqnAssertInternal(const bool result, const char *const file, const i32 lineNum, + const char *const expr, const char *const msg); + + //////////////////////////////////////////////////////////////////////////////// // DqnMem - Memory //////////////////////////////////////////////////////////////////////////////// @@ -445,6 +456,29 @@ bool DqnArray_RemoveStable(DqnArray *array, u64 index) } #endif // DQN_CPP_MODE +//////////////////////////////////////////////////////////////////////////////// +// DqnJobQueue - Multithreaded Job Queue +//////////////////////////////////////////////////////////////////////////////// +typedef void DqnJob_Callback(struct DqnJobQueue *const queue, void *const userData); +typedef struct DqnJob +{ + DqnJob_Callback *callback; + void *userData; +} DqnJob; + +typedef struct DqnJobQueue +{ + DqnJob *volatile jobList; + u32 size; + + // NOTE: Modified by main+worker threads + u32 volatile jobToExecuteIndex; + void *win32Semaphore; + u32 volatile numJobsToComplete; + + // NOTE: Modified by main thread ONLY + u32 volatile jobInsertIndex; +} DqnJobQueue; //////////////////////////////////////////////////////////////////////////////// // Math //////////////////////////////////////////////////////////////////////////////// @@ -817,14 +851,20 @@ DQN_FILE_SCOPE i32 DqnRnd_PCGRange(DqnRandPCGState *pcg, i32 min, i32 max); DQN_FILE_SCOPE bool DqnWin32_UTF8ToWChar (const char *const in, wchar_t *const out, const i32 outLen); DQN_FILE_SCOPE bool DqnWin32_WCharToUTF8 (const wchar_t *const in, char *const out, const i32 outLen); -DQN_FILE_SCOPE void DqnWin32_GetClientDim (const HWND window, LONG *width, LONG *height); -DQN_FILE_SCOPE void DqnWin32_GetRectDim (RECT rect, LONG *width, LONG *height); -DQN_FILE_SCOPE void DqnWin32_DisplayLastError(const char *const errorPrefix); -DQN_FILE_SCOPE void DqnWin32_DisplayErrorCode(const DWORD error, const char *const errorPrefix); +DQN_FILE_SCOPE void DqnWin32_GetClientDim (const HWND window, LONG *width, LONG *height); +DQN_FILE_SCOPE void DqnWin32_GetRectDim (RECT rect, LONG *width, LONG *height); +DQN_FILE_SCOPE void DqnWin32_DisplayLastError (const char *const errorPrefix); +DQN_FILE_SCOPE void DqnWin32_DisplayErrorCode (const DWORD error, const char *const errorPrefix); DQN_FILE_SCOPE void DqnWin32_OutputDebugString(const char *const formatStr, ...); -#endif /* DQN_WIN32_IMPLEMENTATION */ +// buf: Filled with the path to the executable file. +// Returns the offset to the last backslash, -1 if bufLen was not large enough or buf is null. +DQN_FILE_SCOPE i32 DqnWin32_GetEXEDirectory(char *const buf, const u32 bufLen); +// numCores: numThreadsPerCore: Can be NULL, the function will just skip it. +// Uses calloc and free for querying numCores. +DQN_FILE_SCOPE void DqnWin32_GetNumThreadsAndCores(i32 *const numCores, i32 *const numThreadsPerCore); +#endif /* DQN_WIN32_IMPLEMENTATION */ #ifndef DQN_INI_H #define DQN_INI_H @@ -1360,6 +1400,34 @@ STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char comma, char peri // NOTE: DQN_INI_IMPLEMENTATION modified to be included when DQN_IMPLEMENTATION defined // #define DQN_INI_IMPLEMENTATION #define DQN_INI_STRLEN(s) Dqn_strlen(s) + +//////////////////////////////////////////////////////////////////////////////// +// Dqn Error +//////////////////////////////////////////////////////////////////////////////// +#if (defined(_WIN32) || defined(_WIN64)) && defined(DQN_WIN32_IMPLEMENTATION) +#else +#include +#endif + +DQN_FILE_SCOPE bool DqnAssertInternal(const bool result, const char *const file, const i32 lineNum, + const char *const expr, const char *const msg) +{ + if (!result) + { + const char *const formatStrNoMsg = "DqnAssert() failed: %s|%d| (%s)\n"; + const char *const formatStrWithMsg = "DqnAssert() failed: %s|%d| (%s): %s\n"; + const char *const formatStr = (msg) ? formatStrWithMsg : formatStrNoMsg; +#if (defined(_WIN32) || defined(_WIN64)) && defined(DQN_WIN32_IMPLEMENTATION) + DqnWin32_OutputDebugString(formatStr, file, lineNum, expr, msg); +#else + printf(formatStr, file, lineNum, expr, msg); +#endif + + (*((i32 *)0)) = 0; + } + return result; +} + //////////////////////////////////////////////////////////////////////////////// // DqnMemory - Default Memory Routines //////////////////////////////////////////////////////////////////////////////// @@ -3328,12 +3396,91 @@ DQN_FILE_SCOPE void DqnWin32_OutputDebugString(const char *const formatStr, ...) va_start(argList, formatStr); { i32 numCopied = Dqn_vsprintf(str, formatStr, argList); - DQN_ASSERT(numCopied < DQN_ARRAY_COUNT(str)); + DQN_ASSERT_HARD(numCopied < DQN_ARRAY_COUNT(str)); } va_end(argList); OutputDebugString(str); } + +DQN_FILE_SCOPE i32 DqnWin32_GetEXEDirectory(char *const buf, const u32 bufLen) +{ + if (!buf || bufLen == 0) return 0; + u32 copiedLen = GetModuleFileName(NULL, buf, bufLen); + if (copiedLen == bufLen) return -1; + + // NOTE: Should always work if GetModuleFileName works and we're running an + // executable. + i32 lastSlashIndex = 0; + for (i32 i = copiedLen; i > 0; i--) + { + if (buf[i] == '\\') + { + lastSlashIndex = i; + break; + } + } + + return lastSlashIndex; +} + +DQN_FILE_SCOPE void DqnWin32_GetNumThreadsAndCores(i32 *const numCores, i32 *const numThreadsPerCore) +{ + if (numThreadsPerCore) + { + SYSTEM_INFO systemInfo; + GetNativeSystemInfo(&systemInfo); + *numThreadsPerCore = systemInfo.dwNumberOfProcessors; + } + + if (numCores) + { + *numCores = 0; + DWORD requiredSize = 0; + u8 insufficientBuffer = {0}; + GetLogicalProcessorInformationEx( + RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)insufficientBuffer, + &requiredSize); + + u8 *rawProcInfoArray = (u8 *)DqnMem_Calloc(requiredSize); + if (!rawProcInfoArray) + { + DQN_WIN32_ERROR_BOX("DqnMem_Calloc() failed", NULL); + DQN_ASSERT(DQN_INVALID_CODE_PATH); + return; + } + + if (GetLogicalProcessorInformationEx( + RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)rawProcInfoArray, + &requiredSize)) + { + SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *logicalProcInfo = + (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)rawProcInfoArray; + DWORD bytesRead = 0; + + do + { + // NOTE: High efficiency value has greater performance and less efficiency. + PROCESSOR_RELATIONSHIP *procInfo = &logicalProcInfo->Processor; + u32 efficiency = procInfo->EfficiencyClass; + (*numCores)++; + DQN_ASSERT(logicalProcInfo->Relationship == RelationProcessorCore); + DQN_ASSERT(procInfo->GroupCount == 1); + + bytesRead += logicalProcInfo->Size; + logicalProcInfo = + (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((u8 *)logicalProcInfo + + logicalProcInfo->Size); + } while (bytesRead < requiredSize); + } + else + { + DqnWin32_DisplayLastError("GetLogicalProcessorInformationEx() failed"); + } + + DqnMem_Free(rawProcInfoArray); + } +} #endif // DQN_WIN32_PLATFROM FILE_SCOPE bool DqnFile_OpenInternal(const wchar_t *const path,