Lock pixels at a more granular level for multithreading
This commit is contained in:
parent
e660281211
commit
e860145e77
@ -966,13 +966,20 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer,
|
|||||||
renderBuffer.zBuffer = (f32 *)DqnMemStack_Push(
|
renderBuffer.zBuffer = (f32 *)DqnMemStack_Push(
|
||||||
&memory->tempStack, zBufferSize * sizeof(*renderBuffer.zBuffer));
|
&memory->tempStack, zBufferSize * sizeof(*renderBuffer.zBuffer));
|
||||||
|
|
||||||
for (u32 i = 0; i < zBufferSize; i++)
|
renderBuffer.pixelLockTable = (bool *)DqnMemStack_Push(
|
||||||
renderBuffer.zBuffer[i] = DQN_F32_MIN;
|
&memory->tempStack, zBufferSize * sizeof(*renderBuffer.pixelLockTable));
|
||||||
|
|
||||||
|
for (u32 i = 0; i < zBufferSize; i++)
|
||||||
|
{
|
||||||
|
renderBuffer.zBuffer[i] = DQN_F32_MIN;
|
||||||
|
renderBuffer.pixelLockTable[i] = false;
|
||||||
|
}
|
||||||
|
|
||||||
DTRRenderContext renderContext = {};
|
DTRRenderContext renderContext = {};
|
||||||
renderContext.renderBuffer = &renderBuffer;
|
renderContext.renderBuffer = &renderBuffer;
|
||||||
renderContext.tempStack = &memory->tempStack;
|
renderContext.tempStack = &memory->tempStack;
|
||||||
renderContext.api = &input->api;
|
renderContext.api = &input->api;
|
||||||
|
renderContext.jobQueue = input->jobQueue;
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// Update and Render
|
// Update and Render
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
@ -1086,6 +1093,10 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer,
|
|||||||
DTRDebug_EndCycleCount(DTRDebugCycleCount_DTR_Update);
|
DTRDebug_EndCycleCount(DTRDebugCycleCount_DTR_Update);
|
||||||
DTRDebug_Update(state, renderContext, input, memory);
|
DTRDebug_Update(state, renderContext, input, memory);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
while (input->api.QueueTryExecuteNextJob(input->jobQueue) ||
|
||||||
|
!input->api.QueueAllJobsComplete(input->jobQueue))
|
||||||
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -53,6 +53,8 @@ typedef bool PlatformAPI_QueueAddJob (PlatformJobQueue *const queue, c
|
|||||||
typedef bool PlatformAPI_QueueTryExecuteNextJob(PlatformJobQueue *const queue);
|
typedef bool PlatformAPI_QueueTryExecuteNextJob(PlatformJobQueue *const queue);
|
||||||
typedef bool PlatformAPI_QueueAllJobsComplete (PlatformJobQueue *const queue);
|
typedef bool PlatformAPI_QueueAllJobsComplete (PlatformJobQueue *const queue);
|
||||||
|
|
||||||
|
typedef u32 PlatformAPI_AtomicCompareSwap(u32 *volatile dest, u32 swapVal, u32 compareVal);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Platform Locks
|
// Platform Locks
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -77,6 +79,7 @@ typedef struct PlatformAPI
|
|||||||
PlatformAPI_QueueAddJob *QueueAddJob;
|
PlatformAPI_QueueAddJob *QueueAddJob;
|
||||||
PlatformAPI_QueueTryExecuteNextJob *QueueTryExecuteNextJob;
|
PlatformAPI_QueueTryExecuteNextJob *QueueTryExecuteNextJob;
|
||||||
PlatformAPI_QueueAllJobsComplete *QueueAllJobsComplete;
|
PlatformAPI_QueueAllJobsComplete *QueueAllJobsComplete;
|
||||||
|
PlatformAPI_AtomicCompareSwap *AtomicCompareSwap;
|
||||||
|
|
||||||
PlatformAPI_LockInit *LockInit;
|
PlatformAPI_LockInit *LockInit;
|
||||||
PlatformAPI_LockAcquire *LockAcquire;
|
PlatformAPI_LockAcquire *LockAcquire;
|
||||||
|
@ -643,8 +643,9 @@ FILE_SCOPE inline void SIMDSetPixel(DTRRenderContext context, const i32 x, const
|
|||||||
const u32 pitchInU32 = (renderBuffer->width * renderBuffer->bytesPerPixel) / 4;
|
const u32 pitchInU32 = (renderBuffer->width * renderBuffer->bytesPerPixel) / 4;
|
||||||
|
|
||||||
u32 srcPixel = bitmapPtr[x + (y * pitchInU32)];
|
u32 srcPixel = bitmapPtr[x + (y * pitchInU32)];
|
||||||
__m128 src = _mm_set_ps(0, (f32)((srcPixel >> 0) & 0xFF), (f32)((srcPixel >> 8) & 0xFF),
|
__m128 src = _mm_set_ps(0, (f32)((srcPixel >> 0) & 0xFF),
|
||||||
(f32)((srcPixel >> 16) & 0xFF));
|
(f32)((srcPixel >> 8) & 0xFF),
|
||||||
|
(f32)((srcPixel >> 16) & 0xFF));
|
||||||
src = SIMDSRGB255ToLinearSpace1(src);
|
src = SIMDSRGB255ToLinearSpace1(src);
|
||||||
|
|
||||||
f32 invA = 1 - alpha;
|
f32 invA = 1 - alpha;
|
||||||
@ -663,7 +664,9 @@ FILE_SCOPE inline void SIMDSetPixel(DTRRenderContext context, const i32 x, const
|
|||||||
f32 destB = ((f32 *)&dest)[2];
|
f32 destB = ((f32 *)&dest)[2];
|
||||||
|
|
||||||
u32 pixel = // ((u32)(destA) << 24 |
|
u32 pixel = // ((u32)(destA) << 24 |
|
||||||
(u32)(destR) << 16 | (u32)(destG) << 8 | (u32)(destB) << 0;
|
(u32)(destR) << 16 |
|
||||||
|
(u32)(destG) << 8 |
|
||||||
|
(u32)(destB) << 0;
|
||||||
bitmapPtr[x + (y * pitchInU32)] = pixel;
|
bitmapPtr[x + (y * pitchInU32)] = pixel;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -814,21 +817,18 @@ FILE_SCOPE inline void SetCurrZDepth(DTRRenderContext context, i32 posX, i32 pos
|
|||||||
DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMD##type); \
|
DTRDebug_EndCycleCount(DTRDebugCycleCount_SIMD##type); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
FILE_SCOPE void SIMDRasteriseTrianglePixel(DTRRenderContext context,
|
FILE_SCOPE void
|
||||||
const DTRBitmap *const texture, const i32 posX,
|
SIMDRasteriseTrianglePixel(DTRRenderContext context, const DTRBitmap *const texture, const i32 posX,
|
||||||
const i32 posY, const i32 maxX, const DqnV2 uv1,
|
const i32 posY, const i32 maxX, const DqnV2 uv1, const DqnV2 uv2SubUv1,
|
||||||
const DqnV2 uv2SubUv1, const DqnV2 uv3SubUv1,
|
const DqnV2 uv3SubUv1, const __m128 simdColor, const __m128 triangleZ,
|
||||||
const __m128 simdColor, const __m128 triangleZ,
|
const __m128 signedArea, const __m128 invSignedAreaParallelogram_4x,
|
||||||
const __m128 signedArea,
|
const f32 preserveAlpha, const bool ignoreLight, const __m128 p1Light,
|
||||||
const __m128 invSignedAreaParallelogram_4x)
|
const __m128 p2Light, const __m128 p3Light)
|
||||||
{
|
{
|
||||||
const __m128 ZERO_4X = _mm_set_ps1(0.0f);
|
DTRRenderBuffer *const renderBuffer = context.renderBuffer;
|
||||||
const u32 IS_GREATER_MASK = 0xF;
|
const __m128 ZERO_4X = _mm_set_ps1(0.0f);
|
||||||
|
const u32 IS_GREATER_MASK = 0xF;
|
||||||
DTRRenderBuffer *renderBuffer = context.renderBuffer;
|
const u32 zBufferPitch = renderBuffer->width;
|
||||||
|
|
||||||
// TODO(doyle): Copy lighting work over. But not important since using this
|
|
||||||
// function causes performance problems.
|
|
||||||
|
|
||||||
// Rasterise buffer(X, Y) pixel
|
// Rasterise buffer(X, Y) pixel
|
||||||
{
|
{
|
||||||
@ -838,36 +838,56 @@ FILE_SCOPE void SIMDRasteriseTrianglePixel(DTRRenderContext context,
|
|||||||
if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK && posX < maxX)
|
if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK && posX < maxX)
|
||||||
{
|
{
|
||||||
DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_RasterisePixel);
|
DEBUG_SIMD_AUTO_CHOOSE_BEGIN_CYCLE_COUNT(Triangle_RasterisePixel);
|
||||||
__m128 barycentric = _mm_mul_ps(signedArea, invSignedAreaParallelogram_4x);
|
|
||||||
__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
|
|
||||||
|
|
||||||
f32 pixelZDepth =
|
|
||||||
((f32 *)&barycentricZ)[0] + ((f32 *)&barycentricZ)[1] + ((f32 *)&barycentricZ)[2];
|
|
||||||
f32 currZDepth = GetCurrZDepth(context, posX, posY);
|
|
||||||
if (pixelZDepth > currZDepth)
|
|
||||||
{
|
{
|
||||||
SetCurrZDepth(context, posX, posY, pixelZDepth);
|
__m128 barycentric = _mm_mul_ps(signedArea, invSignedAreaParallelogram_4x);
|
||||||
|
__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
|
||||||
|
|
||||||
__m128 finalColor = simdColor;
|
f32 pixelZDepth = ((f32 *)&barycentricZ)[0] + ((f32 *)&barycentricZ)[1] +
|
||||||
if (texture)
|
((f32 *)&barycentricZ)[2];
|
||||||
|
|
||||||
|
i32 zBufferIndex = posX + (posY * zBufferPitch);
|
||||||
|
if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex])
|
||||||
{
|
{
|
||||||
__m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1,
|
renderBuffer->zBuffer[zBufferIndex] = pixelZDepth;
|
||||||
uv3SubUv1, barycentric);
|
__m128 finalColor = simdColor;
|
||||||
finalColor = _mm_mul_ps(texSampledColor, simdColor);
|
if (!ignoreLight)
|
||||||
|
{
|
||||||
|
__m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]);
|
||||||
|
__m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]);
|
||||||
|
__m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]);
|
||||||
|
|
||||||
|
__m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x);
|
||||||
|
__m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x);
|
||||||
|
__m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x);
|
||||||
|
|
||||||
|
__m128 light = _mm_add_ps(barycentricLight3,
|
||||||
|
_mm_add_ps(barycentricLight1, barycentricLight2));
|
||||||
|
|
||||||
|
finalColor = _mm_mul_ps(finalColor, light);
|
||||||
|
((f32 *)&finalColor)[3] = preserveAlpha;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (texture)
|
||||||
|
{
|
||||||
|
__m128 texSampledColor = SIMDSampleTextureForTriangle(
|
||||||
|
texture, uv1, uv2SubUv1, uv3SubUv1, barycentric);
|
||||||
|
finalColor = _mm_mul_ps(texSampledColor, finalColor);
|
||||||
|
}
|
||||||
|
|
||||||
|
SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear);
|
||||||
}
|
}
|
||||||
SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear);
|
|
||||||
}
|
}
|
||||||
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel);
|
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
FILE_SCOPE void SIMDTriangle(DTRRenderContext context,
|
FILE_SCOPE void SIMDTriangle(DTRRenderContext context, const DqnV3 p1, const DqnV3 p2,
|
||||||
const DqnV3 p1, const DqnV3 p2, const DqnV3 p3, const DqnV2 uv1,
|
const DqnV3 p3, const DqnV2 uv1, const DqnV2 uv2, const DqnV2 uv3,
|
||||||
const DqnV2 uv2, const DqnV2 uv3, const f32 lightIntensity1,
|
const f32 lightIntensity1, const f32 lightIntensity2,
|
||||||
const f32 lightIntensity2, const f32 lightIntensity3,
|
const f32 lightIntensity3, const bool ignoreLight,
|
||||||
const bool ignoreLight, DTRBitmap *const texture, DqnV4 color,
|
DTRBitmap *const texture, DqnV4 color, const DqnV2i min,
|
||||||
const DqnV2i min, const DqnV2i max)
|
const DqnV2i max)
|
||||||
|
|
||||||
{
|
{
|
||||||
DTR_DEBUG_EP_TIMED_FUNCTION();
|
DTR_DEBUG_EP_TIMED_FUNCTION();
|
||||||
@ -900,12 +920,12 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context,
|
|||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// Setup SIMD data
|
// Setup SIMD data
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
const u32 NUM_X_PIXELS_TO_SIMD = 2;
|
const u32 NUM_X_PIXELS_TO_SIMD = 1;
|
||||||
const u32 NUM_Y_PIXELS_TO_SIMD = 1;
|
const u32 NUM_Y_PIXELS_TO_SIMD = 1;
|
||||||
|
|
||||||
// SignedArea: _mm_set_ps(unused, p3, p2, p1) ie 0=p1, 1=p1, 2=p3, 3=unused
|
// SignedArea: _mm_set_ps(unused, p3, p2, p1) ie 0=p1, 1=p1, 2=p3, 3=unused
|
||||||
__m128 signedAreaPixel1 = _mm_set_ps1(0);
|
__m128 signedAreaPixel1 = _mm_set_ps1(0);
|
||||||
__m128 signedAreaPixel2 = _mm_set_ps1(0);
|
// __m128 signedAreaPixel2 = _mm_set_ps1(0);
|
||||||
|
|
||||||
__m128 signedAreaPixelDeltaX = _mm_set_ps1(0);
|
__m128 signedAreaPixelDeltaX = _mm_set_ps1(0);
|
||||||
__m128 signedAreaPixelDeltaY = _mm_set_ps1(0);
|
__m128 signedAreaPixelDeltaY = _mm_set_ps1(0);
|
||||||
@ -942,7 +962,7 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context,
|
|||||||
signedAreaPixelDeltaY = _mm_set_ps(0, signedArea3DeltaY, signedArea2DeltaY, signedArea1DeltaY);
|
signedAreaPixelDeltaY = _mm_set_ps(0, signedArea3DeltaY, signedArea2DeltaY, signedArea1DeltaY);
|
||||||
|
|
||||||
signedAreaPixel1 = _mm_set_ps(0, signedArea3Start, signedArea2Start, signedArea1Start);
|
signedAreaPixel1 = _mm_set_ps(0, signedArea3Start, signedArea2Start, signedArea1Start);
|
||||||
signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX);
|
// signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX);
|
||||||
|
|
||||||
// NOTE: Increase step size to the number of pixels rasterised with SIMD
|
// NOTE: Increase step size to the number of pixels rasterised with SIMD
|
||||||
{
|
{
|
||||||
@ -958,10 +978,10 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context,
|
|||||||
const DqnV2 uv2SubUv1 = uv2 - uv1;
|
const DqnV2 uv2SubUv1 = uv2 - uv1;
|
||||||
const DqnV2 uv3SubUv1 = uv3 - uv1;
|
const DqnV2 uv3SubUv1 = uv3 - uv1;
|
||||||
|
|
||||||
#define UNROLL_LOOP 1
|
#define INLINE_RASTERISE 1
|
||||||
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble);
|
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Preamble);
|
||||||
|
|
||||||
#if UNROLL_LOOP
|
#if INLINE_RASTERISE
|
||||||
const u32 IS_GREATER_MASK = 0xF;
|
const u32 IS_GREATER_MASK = 0xF;
|
||||||
const u32 zBufferPitch = renderBuffer->width;
|
const u32 zBufferPitch = renderBuffer->width;
|
||||||
#endif
|
#endif
|
||||||
@ -973,11 +993,11 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context,
|
|||||||
for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD)
|
for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD)
|
||||||
{
|
{
|
||||||
__m128 signedArea1 = signedAreaPixel1;
|
__m128 signedArea1 = signedAreaPixel1;
|
||||||
__m128 signedArea2 = signedAreaPixel2;
|
// __m128 signedArea2 = signedAreaPixel2;
|
||||||
|
|
||||||
for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD)
|
for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD)
|
||||||
{
|
{
|
||||||
#if UNROLL_LOOP
|
#if INLINE_RASTERISE
|
||||||
// Rasterise buffer(X, Y) pixel
|
// Rasterise buffer(X, Y) pixel
|
||||||
{
|
{
|
||||||
__m128 checkArea = signedArea1;
|
__m128 checkArea = signedArea1;
|
||||||
@ -997,108 +1017,64 @@ FILE_SCOPE void SIMDTriangle(DTRRenderContext context,
|
|||||||
((f32 *)&barycentricZ)[2];
|
((f32 *)&barycentricZ)[2];
|
||||||
|
|
||||||
i32 zBufferIndex = posX + (posY * zBufferPitch);
|
i32 zBufferIndex = posX + (posY * zBufferPitch);
|
||||||
context.api->LockAcquire(renderBuffer->renderLock);
|
__m128 finalColor = simdColor;
|
||||||
|
if (!ignoreLight)
|
||||||
|
{
|
||||||
|
__m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]);
|
||||||
|
__m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]);
|
||||||
|
__m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]);
|
||||||
|
|
||||||
|
__m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x);
|
||||||
|
__m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x);
|
||||||
|
__m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x);
|
||||||
|
|
||||||
|
__m128 light = _mm_add_ps(barycentricLight3,
|
||||||
|
_mm_add_ps(barycentricLight1, barycentricLight2));
|
||||||
|
|
||||||
|
finalColor = _mm_mul_ps(finalColor, light);
|
||||||
|
((f32 *)&finalColor)[3] = preserveAlpha;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (texture)
|
||||||
|
{
|
||||||
|
__m128 texSampledColor = SIMDSampleTextureForTriangle(
|
||||||
|
texture, uv1, uv2SubUv1, uv3SubUv1, barycentric);
|
||||||
|
finalColor = _mm_mul_ps(texSampledColor, finalColor);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
bool currLockValue;
|
||||||
|
do
|
||||||
|
{
|
||||||
|
currLockValue = (bool)context.api->AtomicCompareSwap(
|
||||||
|
(u32 *)&renderBuffer->pixelLockTable[zBufferIndex], (u32)true, (u32)false);
|
||||||
|
} while (currLockValue != false);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex])
|
if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex])
|
||||||
{
|
{
|
||||||
renderBuffer->zBuffer[zBufferIndex] = pixelZDepth;
|
renderBuffer->zBuffer[zBufferIndex] = pixelZDepth;
|
||||||
|
|
||||||
__m128 finalColor = simdColor;
|
|
||||||
if (!ignoreLight)
|
|
||||||
{
|
|
||||||
__m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]);
|
|
||||||
__m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]);
|
|
||||||
__m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]);
|
|
||||||
|
|
||||||
__m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x);
|
|
||||||
__m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x);
|
|
||||||
__m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x);
|
|
||||||
|
|
||||||
__m128 light =
|
|
||||||
_mm_add_ps(barycentricLight3,
|
|
||||||
_mm_add_ps(barycentricLight1, barycentricLight2));
|
|
||||||
|
|
||||||
finalColor = _mm_mul_ps(finalColor, light);
|
|
||||||
((f32 *)&finalColor)[3] = preserveAlpha;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (texture)
|
|
||||||
{
|
|
||||||
__m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric);
|
|
||||||
finalColor = _mm_mul_ps(texSampledColor, finalColor);
|
|
||||||
}
|
|
||||||
SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear);
|
SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear);
|
||||||
}
|
}
|
||||||
context.api->LockRelease(renderBuffer->renderLock);
|
renderBuffer->pixelLockTable[zBufferIndex] = false;
|
||||||
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel);
|
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_RasterisePixel);
|
||||||
}
|
}
|
||||||
signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
|
signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rasterise buffer(X + 1, Y) pixel
|
|
||||||
{
|
|
||||||
__m128 checkArea = signedArea2;
|
|
||||||
__m128 isGreater = _mm_cmpge_ps(checkArea, ZERO_4X);
|
|
||||||
i32 isGreaterResult = _mm_movemask_ps(isGreater);
|
|
||||||
i32 posX = bufferX + 1;
|
|
||||||
i32 posY = bufferY;
|
|
||||||
if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK && posX < max.x)
|
|
||||||
{
|
|
||||||
__m128 barycentric = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x);
|
|
||||||
__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
|
|
||||||
|
|
||||||
f32 pixelZDepth = ((f32 *)&barycentricZ)[0] +
|
|
||||||
((f32 *)&barycentricZ)[1] +
|
|
||||||
((f32 *)&barycentricZ)[2];
|
|
||||||
i32 zBufferIndex = posX + (posY * zBufferPitch);
|
|
||||||
context.api->LockAcquire(renderBuffer->renderLock);
|
|
||||||
if (pixelZDepth > renderBuffer->zBuffer[zBufferIndex])
|
|
||||||
{
|
|
||||||
renderBuffer->zBuffer[zBufferIndex] = pixelZDepth;
|
|
||||||
|
|
||||||
__m128 finalColor = simdColor;
|
|
||||||
if (!ignoreLight)
|
|
||||||
{
|
|
||||||
__m128 barycentricA_4x = _mm_set_ps1(((f32 *)&barycentric)[0]);
|
|
||||||
__m128 barycentricB_4x = _mm_set_ps1(((f32 *)&barycentric)[1]);
|
|
||||||
__m128 barycentricC_4x = _mm_set_ps1(((f32 *)&barycentric)[2]);
|
|
||||||
|
|
||||||
__m128 barycentricLight1 = _mm_mul_ps(p1Light, barycentricA_4x);
|
|
||||||
__m128 barycentricLight2 = _mm_mul_ps(p2Light, barycentricB_4x);
|
|
||||||
__m128 barycentricLight3 = _mm_mul_ps(p3Light, barycentricC_4x);
|
|
||||||
|
|
||||||
__m128 light =
|
|
||||||
_mm_add_ps(barycentricLight3,
|
|
||||||
_mm_add_ps(barycentricLight1, barycentricLight2));
|
|
||||||
|
|
||||||
finalColor = _mm_mul_ps(finalColor, light);
|
|
||||||
((f32 *)&finalColor)[3] = preserveAlpha;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (texture)
|
|
||||||
{
|
|
||||||
__m128 texSampledColor = SIMDSampleTextureForTriangle(texture, uv1, uv2SubUv1, uv3SubUv1, barycentric);
|
|
||||||
finalColor = _mm_mul_ps(texSampledColor, finalColor);
|
|
||||||
}
|
|
||||||
SIMDSetPixel(context, posX, posY, finalColor, ColorSpace_Linear);
|
|
||||||
}
|
|
||||||
context.api->LockRelease(renderBuffer->renderLock);
|
|
||||||
}
|
|
||||||
signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX);
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
SIMDRasteriseTrianglePixel(renderBuffer, texture, bufferX, bufferY, max.x, uv1, uv2SubUv1,
|
SIMDRasteriseTrianglePixel(context, texture, bufferX, bufferY, max.x, uv1, uv2SubUv1,
|
||||||
uv3SubUv1, simdColor, triangleZ, signedArea1,
|
uv3SubUv1, simdColor, triangleZ, signedArea1,
|
||||||
invSignedAreaParallelogram_4x);
|
invSignedAreaParallelogram_4x, preserveAlpha, ignoreLight,
|
||||||
SIMDRasteriseTrianglePixel(renderBuffer, texture, bufferX + 1, bufferY, max.x, uv1, uv2SubUv1,
|
p1Light, p2Light, p3Light);
|
||||||
uv3SubUv1, simdColor, triangleZ, signedArea2,
|
|
||||||
invSignedAreaParallelogram_4x);
|
|
||||||
signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
|
signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
|
||||||
signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX);
|
// signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY);
|
signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY);
|
||||||
signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY);
|
// signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY);
|
||||||
}
|
}
|
||||||
|
|
||||||
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Rasterise);
|
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle_Rasterise);
|
||||||
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle);
|
DEBUG_SIMD_AUTO_CHOOSE_END_CYCLE_COUNT(Triangle);
|
||||||
}
|
}
|
||||||
|
@ -20,6 +20,8 @@ typedef struct DTRRenderBuffer
|
|||||||
volatile u8 *memory; // Format: XX RR GG BB, and has (width * height * bytesPerPixels) elements
|
volatile u8 *memory; // Format: XX RR GG BB, and has (width * height * bytesPerPixels) elements
|
||||||
volatile f32 *zBuffer; // zBuffer has (width * height) elements
|
volatile f32 *zBuffer; // zBuffer has (width * height) elements
|
||||||
|
|
||||||
|
volatile bool *pixelLockTable; // has (width * height) elements
|
||||||
|
|
||||||
} DTRRenderBuffer;
|
} DTRRenderBuffer;
|
||||||
|
|
||||||
// Using transforms for 2D ignores the 'z' element.
|
// Using transforms for 2D ignores the 'z' element.
|
||||||
@ -76,9 +78,10 @@ typedef struct DTRRenderLight
|
|||||||
|
|
||||||
typedef struct DTRRenderContext
|
typedef struct DTRRenderContext
|
||||||
{
|
{
|
||||||
DTRRenderBuffer *renderBuffer;
|
DTRRenderBuffer *renderBuffer;
|
||||||
DqnMemStack *tempStack;
|
DqnMemStack *tempStack;
|
||||||
PlatformAPI *api;
|
PlatformAPI *api;
|
||||||
|
PlatformJobQueue *jobQueue;
|
||||||
} DTRRenderContext;
|
} DTRRenderContext;
|
||||||
|
|
||||||
// NOTE: All colors should be in the range of [0->1] where DqnV4 is a struct with 4 floats, rgba
|
// NOTE: All colors should be in the range of [0->1] where DqnV4 is a struct with 4 floats, rgba
|
||||||
|
@ -8,6 +8,18 @@
|
|||||||
#define UNICODE
|
#define UNICODE
|
||||||
#define _UNICODE
|
#define _UNICODE
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Platform Atomics
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
u32 Platform_AtomicCompareSwap(u32 *volatile dest, u32 swapVal, u32 compareVal)
|
||||||
|
{
|
||||||
|
// TODO(doyle): Compile time assert
|
||||||
|
DQN_ASSERT(sizeof(LONG) == sizeof(u32));
|
||||||
|
u32 result =
|
||||||
|
(u32)InterlockedCompareExchange((LONG volatile *)dest, (LONG)swapVal, (LONG)compareVal);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Platform Mutex/Lock
|
// Platform Mutex/Lock
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -708,6 +720,8 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi
|
|||||||
platformAPI.QueueTryExecuteNextJob = Platform_QueueTryExecuteNextJob;
|
platformAPI.QueueTryExecuteNextJob = Platform_QueueTryExecuteNextJob;
|
||||||
platformAPI.QueueAllJobsComplete = Platform_QueueAllJobsComplete;
|
platformAPI.QueueAllJobsComplete = Platform_QueueAllJobsComplete;
|
||||||
|
|
||||||
|
platformAPI.AtomicCompareSwap = Platform_AtomicCompareSwap;
|
||||||
|
|
||||||
platformAPI.LockInit = Platform_LockInit;
|
platformAPI.LockInit = Platform_LockInit;
|
||||||
platformAPI.LockAcquire = Platform_LockAcquire;
|
platformAPI.LockAcquire = Platform_LockAcquire;
|
||||||
platformAPI.LockRelease = Platform_LockRelease;
|
platformAPI.LockRelease = Platform_LockRelease;
|
||||||
@ -903,8 +917,7 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLi
|
|||||||
|
|
||||||
if (dllCode.DTR_Update)
|
if (dllCode.DTR_Update)
|
||||||
{
|
{
|
||||||
dllCode.DTR_Update(&platformBuffer, &platformInput,
|
dllCode.DTR_Update(&platformBuffer, &platformInput, &globalPlatformMemory);
|
||||||
&globalPlatformMemory);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user