Accelerate textured triangle rendering using SIMD

This commit is contained in:
Doyle Thai 2017-05-30 17:41:05 +10:00
parent 4d2a7a7c06
commit 49270a2826
7 changed files with 324 additions and 132 deletions

View File

@ -1104,8 +1104,8 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer,
} }
else else
{ {
DTRRender_TexturedTriangle(&renderBuffer, screenVA, screenVB, screenVC, texA, texB, DTRRender_TexturedTriangle(input, &renderBuffer, screenVA, screenVB, screenVC, texA,
texC, &state->mesh.tex, modelCol); texB, texC, &state->mesh.tex, modelCol);
} }
bool DEBUG_WIREFRAME = false; bool DEBUG_WIREFRAME = false;

View File

@ -17,14 +17,23 @@ void DTRAsset_InitGlobalState()
stbi_set_flip_vertically_on_load(true); stbi_set_flip_vertically_on_load(true);
} }
FILE_SCOPE void MemcopyInternal(u8 *dest, u8 *src, size_t numBytes) FILE_SCOPE void MemcopyInternal(u8 *const dest, u8 *const src, size_t numBytes)
{ {
if (!dest || !src || numBytes == 0) return; if (!dest || !src || numBytes == 0) return;
for (size_t i = 0; i < numBytes; i++) for (size_t i = 0; i < numBytes; i++)
dest[i] = src[i]; dest[i] = src[i];
} }
FILE_SCOPE void AssetDqnArrayMemAPICallback(DqnMemAPICallbackInfo info, DqnMemAPICallbackResult *result) // NOTE: Dynamic array allocations just requests space at the first option it
// can take. Realloc will reallocate in place if there's space. Otherwise
// it'll create a new block and reallocate there by copying the old data over.
// So this does waste space. But is a quick way to reroute allocations into
// a MemStack. Its main intended purpose is one-shot loading of data when you
// don't know how much space you will need in your DArray. After filling out
// the dynamic array you then compact the data manually using memcopys into
// a new block and discard the old data.
FILE_SCOPE void DumbDynamicArrayMemAPICallback(DqnMemAPICallbackInfo info, DqnMemAPICallbackResult *result)
{ {
DQN_ASSERT(info.type != DqnMemAPICallbackType_Invalid); DQN_ASSERT(info.type != DqnMemAPICallbackType_Invalid);
DqnMemStack *stack = static_cast<DqnMemStack *>(info.userContext); DqnMemStack *stack = static_cast<DqnMemStack *>(info.userContext);
@ -40,27 +49,7 @@ FILE_SCOPE void AssetDqnArrayMemAPICallback(DqnMemAPICallbackInfo info, DqnMemAP
case DqnMemAPICallbackType_Free: case DqnMemAPICallbackType_Free:
{ {
DqnMemStackBlock **blockPtr = &stack->block; DQN_ASSERT(DQN_INVALID_CODE_PATH);
while (*blockPtr && (*blockPtr)->memory != info.ptrToFree)
{
// NOTE(doyle): Ensure that the base ptr of each block is always
// actually aligned so we don't ever miss finding the block if
// the allocator had to realign the pointer from the base
// address.
if (DTR_DEBUG)
{
size_t memBaseAddr = (size_t)((*blockPtr)->memory);
DQN_ASSERT(DQN_ALIGN_POW_N(memBaseAddr, stack->byteAlign) ==
memBaseAddr);
}
blockPtr = &((*blockPtr)->prevBlock);
}
DQN_ASSERT(*blockPtr && (*blockPtr)->memory == info.ptrToFree);
DqnMemStackBlock *blockToFree = *blockPtr;
*blockPtr = blockToFree->prevBlock;
DqnMem_Free(blockToFree);
} }
break; break;
@ -189,7 +178,7 @@ bool DTRAsset_LoadWavefrontObj(const PlatformAPI api, DqnMemStack *const memStac
size_t fileSize = file.size; size_t fileSize = file.size;
DqnMemAPI memAPI = {}; DqnMemAPI memAPI = {};
memAPI.callback = AssetDqnArrayMemAPICallback; memAPI.callback = DumbDynamicArrayMemAPICallback;
memAPI.userContext = memStack; memAPI.userContext = memStack;
enum WavefVertexType { enum WavefVertexType {
@ -199,6 +188,23 @@ bool DTRAsset_LoadWavefrontObj(const PlatformAPI api, DqnMemStack *const memStac
WavefVertexType_Normal, WavefVertexType_Normal,
}; };
// TODO(doyle): We should profile, reading it out to WavefModel format and
// then copying it over, versus just reading the file twice. First pass is
// to count the number of vertexes etc. for each section we need. Then the
// second pass we can allocate directly the number we need and reparse it.
// I have a feeling that, in general, that's a better idea; at least it gets
// rid of a lot of stupid copying code and memstack juggling.
// NOTE(doyle): We pre-process the data into an intermediate format that
// more accurately represents the file format. Since there's no metadata
// inside Wavefront objects, we don't know how many vertexes/texUV/normals
// there are, which makes it hard to allocate "nicely" out of our memory
// stack.
// So we preprocess. Then once we know the final amount, copy over the data
// to a new memstack block such that all the data is compacted together in
// memory for locality. Then just throw away the intermediate
// representation.
WavefModel dummy_ = {}; WavefModel dummy_ = {};
WavefModel *obj = &dummy_; WavefModel *obj = &dummy_;
@ -410,11 +416,17 @@ bool DTRAsset_LoadWavefrontObj(const PlatformAPI api, DqnMemStack *const memStac
DQN_ASSERT(obj->groupNameIndex + 1 < DQN_ARRAY_COUNT(obj->groupName)); DQN_ASSERT(obj->groupNameIndex + 1 < DQN_ARRAY_COUNT(obj->groupName));
DQN_ASSERT(!obj->groupName[obj->groupNameIndex]); DQN_ASSERT(!obj->groupName[obj->groupNameIndex]);
// TODO(doyle): Broken since I don't "copy" it over to our
// final DTRMesh. Below I copy over the data so that all the
// allocations are compacted together, but I don't copy this
// yet, which means the name currently gets trashed.
#if 0
obj->groupName[obj->groupNameIndex++] = obj->groupName[obj->groupNameIndex++] =
(char *)DqnMemStack_Push(memStack, (nameLen + 1) * sizeof(char)); (char *)DqnMemStack_Push(memStack, (nameLen + 1) * sizeof(char));
for (i32 i = 0; i < nameLen; i++) for (i32 i = 0; i < nameLen; i++)
obj->groupName[obj->groupNameIndex - 1][i] = namePtr[i]; obj->groupName[obj->groupNameIndex - 1][i] = namePtr[i];
#endif
while (scan && (*scan == ' ' || *scan == '\n')) while (scan && (*scan == ' ' || *scan == '\n'))
scan++; scan++;

View File

@ -121,7 +121,9 @@ void inline DTRDebug_BeginCycleCount(enum DTRDebugCycleCount tag)
{ {
if (globalDebug.input && globalDebug.input->canUseRdtsc) if (globalDebug.input && globalDebug.input->canUseRdtsc)
{ {
globalDebug.cycleCount[tag] = __rdtsc(); DTRDebugCycles *const cycles = &globalDebug.cycles[tag];
cycles->tmpStartCycles = __rdtsc();
cycles->numInvokes++;
} }
} }
} }
@ -132,7 +134,8 @@ void inline DTRDebug_EndCycleCount(enum DTRDebugCycleCount tag)
{ {
if (globalDebug.input && globalDebug.input->canUseRdtsc) if (globalDebug.input && globalDebug.input->canUseRdtsc)
{ {
globalDebug.cycleCount[tag] = __rdtsc() - globalDebug.cycleCount[tag]; DTRDebugCycles *const cycles = &globalDebug.cycles[tag];
cycles->totalCycles += __rdtsc() - cycles->tmpStartCycles;
} }
} }
} }
@ -214,9 +217,16 @@ void DTRDebug_Update(DTRState *const state,
DTRDebug_PushText("TrianglesRendered: %'lld", debug->counter[DTRDebugCounter_RenderTriangle]); DTRDebug_PushText("TrianglesRendered: %'lld", debug->counter[DTRDebugCounter_RenderTriangle]);
DTRDebug_PushText(""); DTRDebug_PushText("");
for (i32 i = 0; i < DQN_ARRAY_COUNT(debug->cycleCount); i++) DTRDebugCycles emptyDebugCycles = {};
for (i32 i = 0; i < DQN_ARRAY_COUNT(debug->cycles); i++)
{ {
DTRDebug_PushText("%d: %'lld cycles", i, debug->cycleCount[i]); DTRDebugCycles *const cycles = &globalDebug.cycles[i];
u64 invocations = (cycles->numInvokes == 0) ? 1 : cycles->numInvokes;
u64 avgCycles = cycles->totalCycles / invocations;
DTRDebug_PushText("%d: %'lld avg cycles", i, avgCycles);
*cycles = emptyDebugCycles;
} }
DTRDebug_PushText(""); DTRDebug_PushText("");

View File

@ -47,6 +47,14 @@ enum DTRDebugCycleCount
DTRDebugCycleCount_Count, DTRDebugCycleCount_Count,
}; };
// Per-tag cycle timing accumulator, one entry per DTRDebugCycleCount tag.
// DTRDebug_BeginCycleCount() stores __rdtsc() in tmpStartCycles and increments
// numInvokes; DTRDebug_EndCycleCount() adds the elapsed cycles to totalCycles.
// DTRDebug_Update() prints totalCycles / numInvokes (treating 0 invokes as 1)
// and then resets the entry to zero.
typedef struct DTRDebugCycles
{
u64 totalCycles; // Sum of elapsed cycles over all Begin/End pairs since the last reset
u64 numInvokes; // Number of DTRDebug_BeginCycleCount() calls since the last reset
u64 tmpStartCycles; // __rdtsc() value at the last Begin; used to calculate the number of cycles elapsed
} DTRDebugCycles;
typedef struct DTRDebug typedef struct DTRDebug
{ {
struct DTRFont *font; struct DTRFont *font;
@ -57,9 +65,9 @@ typedef struct DTRDebug
DqnV2 displayP; DqnV2 displayP;
i32 displayYOffset; i32 displayYOffset;
u64 cycleCount[DTRDebugCycleCount_Count]; DTRDebugCycles cycles [DTRDebugCycleCount_Count];
u64 counter [DTRDebugCounter_Count]; u64 counter[DTRDebugCounter_Count];
u64 totalSetPixels; u64 totalSetPixels;
} DTRDebug; } DTRDebug;
extern DTRDebug globalDebug; extern DTRDebug globalDebug;

View File

@ -7,6 +7,8 @@
#include "external/stb_rect_pack.h" #include "external/stb_rect_pack.h"
#include "external/stb_truetype.h" #include "external/stb_truetype.h"
#include <intrin.h>
FILE_SCOPE const f32 COLOR_EPSILON = 0.9f; FILE_SCOPE const f32 COLOR_EPSILON = 0.9f;
FILE_SCOPE inline DqnV4 PreMultiplyAlpha1(const DqnV4 color) FILE_SCOPE inline DqnV4 PreMultiplyAlpha1(const DqnV4 color)
@ -497,7 +499,8 @@ FILE_SCOPE void DebugBarycentricInternal(DqnV2 p, DqnV2 a, DqnV2 b, DqnV2 c, f32
*u = 1.0f - *v - *w; *u = 1.0f - *v - *w;
} }
void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, void DTRRender_TexturedTriangle(PlatformInput *const input,
DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3,
DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture,
DqnV4 color, const DTRRenderTransform transform) DqnV4 color, const DTRRenderTransform transform)
{ {
@ -549,125 +552,287 @@ void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, D
const DqnV3 b = p2; const DqnV3 b = p2;
const DqnV3 c = p3; const DqnV3 c = p3;
DqnV2i startP = min;
f32 oldSignedArea1 = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
f32 oldSignedArea2 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
f32 oldSignedArea3 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
f32 signedArea1 = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
f32 signedArea1DeltaX = a.y - b.y;
f32 signedArea1DeltaY = b.x - a.x;
f32 signedArea2 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
f32 signedArea2DeltaX = b.y - c.y;
f32 signedArea2DeltaY = c.x - b.x;
f32 signedArea3 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
f32 signedArea3DeltaX = c.y - a.y;
f32 signedArea3DeltaY = a.x - c.x;
f32 signedAreaParallelogram = signedArea1 + signedArea2 + signedArea3;
if (signedAreaParallelogram == 0) return;
f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;
DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise); DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Scan and Render // Scan and Render
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
const u32 zBufferPitch = renderBuffer->width; const u32 zBufferPitch = renderBuffer->width;
const f32 BARYCENTRIC_EPSILON = 0.1f; if (input->canUseSSE2)
u8 *texturePtr = texture->memory;
const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
{ {
f32 signedArea1Row = signedArea1; DqnV2i startP = min;
f32 signedArea2Row = signedArea2; f32 edge1SignedAreaPixel1 = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
f32 signedArea3Row = signedArea3; f32 edge1SignedAreaPixel1DeltaX = a.y - b.y;
f32 edge1SignedAreaPixel1DeltaY = b.x - a.x;
for (i32 bufferX = min.x; bufferX < max.x; bufferX++) f32 edge2SignedAreaPixel1 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
f32 edge2SignedAreaPixel1DeltaX = b.y - c.y;
f32 edge2SignedAreaPixel1DeltaY = c.x - b.x;
f32 edge3SignedAreaPixel1 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
f32 edge3SignedAreaPixel1DeltaX = c.y - a.y;
f32 edge3SignedAreaPixel1DeltaY = a.x - c.x;
f32 signedAreaParallelogramPixel1 = edge1SignedAreaPixel1 + edge2SignedAreaPixel1 + edge3SignedAreaPixel1;
if (signedAreaParallelogramPixel1 == 0) return;
f32 invSignedAreaParallelogramPixel1 = 1 / signedAreaParallelogramPixel1;
__m128 zero_4x = _mm_set_ps1(0.0f);
__m128 two_4x = _mm_set_ps1(2.0f);
__m128 invSignedAreaParallelogram4x = _mm_set_ps1(invSignedAreaParallelogramPixel1);
__m128 triangleZ = _mm_set_ps(0, b.z, a.z, c.z);
__m128 signedAreaPixelDeltaX = _mm_set_ps(0, edge3SignedAreaPixel1DeltaX, edge2SignedAreaPixel1DeltaX, edge1SignedAreaPixel1DeltaX);
__m128 signedAreaPixelDeltaY = _mm_set_ps(0, edge3SignedAreaPixel1DeltaY, edge2SignedAreaPixel1DeltaY, edge1SignedAreaPixel1DeltaY);
__m128 signedAreaPixel1 = _mm_set_ps(0, edge3SignedAreaPixel1, edge2SignedAreaPixel1, edge1SignedAreaPixel1);
__m128 signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaX);
// NOTE: Step size of 2 pixels across X
signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, two_4x);
const DqnV2 uv2SubUv1 = uv2 - uv1;
const DqnV2 uv3SubUv1 = uv3 - uv1;
const u32 IS_GREATER_MASK = 0xF;
for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
{ {
if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0) __m128 signedArea1 = signedAreaPixel1;
__m128 signedArea2 = signedAreaPixel2;
for (i32 bufferX = min.x; bufferX < max.x; bufferX += 2)
{ {
f32 barycentricB = signedArea3Row * invSignedAreaParallelogram; __m128 isGreater1 = _mm_cmpge_ps(signedArea1, zero_4x);
f32 barycentricC = signedArea1Row * invSignedAreaParallelogram; i32 isGreaterResult1 = _mm_movemask_ps(isGreater1);
if ((isGreaterResult1 & IS_GREATER_MASK) == IS_GREATER_MASK)
if (DTR_DEBUG)
{ {
const f32 EPSILON = 0.1f; __m128 barycentric = _mm_mul_ps(signedArea1, invSignedAreaParallelogram4x);
__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
f32 debugSignedArea1 = ((b.x - a.x) * (bufferY - a.y)) - ((b.y - a.y) * (bufferX - a.x)); i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
f32 debugSignedArea2 = ((c.x - b.x) * (bufferY - b.y)) - ((c.y - b.y) * (bufferX - b.x)); f32 pixelZValue = ((f32 *)&barycentricZ)[0] +
f32 debugSignedArea3 = ((a.x - c.x) * (bufferY - c.y)) - ((a.y - c.y) * (bufferX - c.x)); ((f32 *)&barycentricZ)[1] +
((f32 *)&barycentricZ)[2];
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
if (pixelZValue > currZValue)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
u8 *texturePtr = texture->memory;
const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
f32 deltaSignedArea1 = debugSignedArea1 - signedArea1Row; f32 barycentricB = ((f32 *)&barycentric)[2];
f32 deltaSignedArea2 = debugSignedArea2 - signedArea2Row; f32 barycentricC = ((f32 *)&barycentric)[0];
f32 deltaSignedArea3 = debugSignedArea3 - signedArea3Row; DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
DQN_ASSERT(deltaSignedArea1 < EPSILON && deltaSignedArea2 < EPSILON &&
deltaSignedArea3 < EPSILON)
f32 debugBarycentricA, debugBarycentricB, debugBarycentricC; const f32 EPSILON = 0.1f;
DebugBarycentricInternal(DqnV2_2i(bufferX, bufferY), a.xy, b.xy, c.xy, DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
&debugBarycentricA, &debugBarycentricB, DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
&debugBarycentricC); uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
f32 texelXf = uv.x * texture->dim.w;
f32 texelYf = uv.y * texture->dim.h;
DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
f32 deltaBaryB = DQN_ABS(barycentricB - debugBarycentricB); i32 texelX = (i32)texelXf;
f32 deltaBaryC = DQN_ABS(barycentricC - debugBarycentricC); i32 texelY = (i32)texelYf;
u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
(texelY * texturePitch));
DqnV4 color1;
color1.a = (f32)(texel1 >> 24);
color1.b = (f32)((texel1 >> 16) & 0xFF);
color1.g = (f32)((texel1 >> 8) & 0xFF);
color1.r = (f32)((texel1 >> 0) & 0xFF);
color1 *= DTRRENDER_INV_255;
color1 = DTRRender_SRGB1ToLinearSpaceV4(color1);
DqnV4 blend = color * color1;
SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
}
DQN_ASSERT(deltaBaryB < EPSILON && deltaBaryC < EPSILON)
} }
i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); __m128 isGreater2 = _mm_cmpge_ps(signedArea2, zero_4x);
f32 pixelZValue = a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z)); i32 isGreaterResult2 = _mm_movemask_ps(isGreater2);
f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; i32 bufferX1 = bufferX + 1;
DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height)); if ((isGreaterResult2 & IS_GREATER_MASK) == IS_GREATER_MASK && bufferX1 < max.x)
if (pixelZValue > currZValue)
{ {
renderBuffer->zBuffer[zBufferIndex] = pixelZValue; __m128 barycentric = _mm_mul_ps(signedArea2, invSignedAreaParallelogram4x);
const bool DEBUG_SAMPLE_TEXTURE = true; __m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
DqnV2 uv = uv1 + ((uv2 - uv1) * barycentricB) + ((uv3 - uv1) * barycentricC);
const f32 EPSILON = 0.1f; i32 zBufferIndex = bufferX1 + (bufferY * zBufferPitch);
DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON); f32 pixelZValue = ((f32 *)&barycentricZ)[0] +
DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON); ((f32 *)&barycentricZ)[1] +
((f32 *)&barycentricZ)[2];
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
if (pixelZValue > currZValue)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
u8 *texturePtr = texture->memory;
const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f); f32 barycentricB = ((f32 *)&barycentric)[2];
uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f); f32 barycentricC = ((f32 *)&barycentric)[0];
DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
f32 texelXf = uv.x * texture->dim.w; const f32 EPSILON = 0.1f;
f32 texelYf = uv.y * texture->dim.h; DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w); DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h); uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
i32 texelX = (i32)texelXf; f32 texelXf = uv.x * texture->dim.w;
i32 texelY = (i32)texelYf; f32 texelYf = uv.y * texture->dim.h;
DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + i32 texelX = (i32)texelXf;
(texelY * texturePitch)); i32 texelY = (i32)texelYf;
DqnV4 color1; u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
color1.a = (f32)(texel1 >> 24); (texelY * texturePitch));
color1.b = (f32)((texel1 >> 16) & 0xFF);
color1.g = (f32)((texel1 >> 8) & 0xFF);
color1.r = (f32)((texel1 >> 0) & 0xFF);
color1 *= DTRRENDER_INV_255; DqnV4 color1;
color1 = DTRRender_SRGB1ToLinearSpaceV4(color1); color1.a = (f32)(texel1 >> 24);
DqnV4 blend = color * color1; color1.b = (f32)((texel1 >> 16) & 0xFF);
SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear); color1.g = (f32)((texel1 >> 8) & 0xFF);
color1.r = (f32)((texel1 >> 0) & 0xFF);
color1 *= DTRRENDER_INV_255;
color1 = DTRRender_SRGB1ToLinearSpaceV4(color1);
DqnV4 blend = color * color1;
SetPixel(renderBuffer, bufferX1, bufferY, blend, ColorSpace_Linear);
}
} }
signedArea1 = _mm_add_ps(signedArea1, signedAreaPixelDeltaX);
signedArea2 = _mm_add_ps(signedArea2, signedAreaPixelDeltaX);
} }
signedArea1Row += signedArea1DeltaX; signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, signedAreaPixelDeltaY);
signedArea2Row += signedArea2DeltaX; signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, signedAreaPixelDeltaY);
signedArea3Row += signedArea3DeltaX;
} }
}
else
{
DqnV2i startP = min;
f32 signedArea1 = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
f32 signedArea1DeltaX = a.y - b.y;
f32 signedArea1DeltaY = b.x - a.x;
signedArea1 += signedArea1DeltaY; f32 signedArea2 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
signedArea2 += signedArea2DeltaY; f32 signedArea2DeltaX = b.y - c.y;
signedArea3 += signedArea3DeltaY; f32 signedArea2DeltaY = c.x - b.x;
f32 signedArea3 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
f32 signedArea3DeltaX = c.y - a.y;
f32 signedArea3DeltaY = a.x - c.x;
f32 signedAreaParallelogram = signedArea1 + signedArea2 + signedArea3;
if (signedAreaParallelogram == 0) return;
f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;
for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
{
f32 signedArea1Row = signedArea1;
f32 signedArea2Row = signedArea2;
f32 signedArea3Row = signedArea3;
for (i32 bufferX = min.x; bufferX < max.x; bufferX++)
{
if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0)
{
f32 barycentricB = signedArea3Row * invSignedAreaParallelogram;
f32 barycentricC = signedArea1Row * invSignedAreaParallelogram;
if (DTR_DEBUG)
{
const f32 EPSILON = 0.1f;
f32 debugSignedArea1 = ((b.x - a.x) * (bufferY - a.y)) - ((b.y - a.y) * (bufferX - a.x));
f32 debugSignedArea2 = ((c.x - b.x) * (bufferY - b.y)) - ((c.y - b.y) * (bufferX - b.x));
f32 debugSignedArea3 = ((a.x - c.x) * (bufferY - c.y)) - ((a.y - c.y) * (bufferX - c.x));
f32 deltaSignedArea1 = DQN_ABS(debugSignedArea1 - signedArea1Row);
f32 deltaSignedArea2 = DQN_ABS(debugSignedArea2 - signedArea2Row);
f32 deltaSignedArea3 = DQN_ABS(debugSignedArea3 - signedArea3Row);
DQN_ASSERT(deltaSignedArea1 < EPSILON && deltaSignedArea2 < EPSILON &&
deltaSignedArea3 < EPSILON)
f32 debugBarycentricA, debugBarycentricB, debugBarycentricC;
DebugBarycentricInternal(DqnV2_2i(bufferX, bufferY), a.xy, b.xy, c.xy,
&debugBarycentricA, &debugBarycentricB,
&debugBarycentricC);
f32 deltaBaryB = DQN_ABS(barycentricB - debugBarycentricB);
f32 deltaBaryC = DQN_ABS(barycentricC - debugBarycentricC);
DQN_ASSERT(deltaBaryB < EPSILON && deltaBaryC < EPSILON)
}
i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
f32 pixelZValue =
a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z));
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height));
if (pixelZValue > currZValue)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
if (texture)
{
u8 *texturePtr = texture->memory;
const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
DqnV2 uv =
uv1 + ((uv2 - uv1) * barycentricB) + ((uv3 - uv1) * barycentricC);
const f32 EPSILON = 0.1f;
DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
f32 texelXf = uv.x * texture->dim.w;
f32 texelYf = uv.y * texture->dim.h;
DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
i32 texelX = (i32)texelXf;
i32 texelY = (i32)texelYf;
u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
(texelY * texturePitch));
DqnV4 color1;
color1.a = (f32)(texel1 >> 24);
color1.b = (f32)((texel1 >> 16) & 0xFF);
color1.g = (f32)((texel1 >> 8) & 0xFF);
color1.r = (f32)((texel1 >> 0) & 0xFF);
color1 *= DTRRENDER_INV_255;
color1 = DTRRender_SRGB1ToLinearSpaceV4(color1);
DqnV4 blend = color * color1;
SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
}
else
{
SetPixel(renderBuffer, bufferX, bufferY, color, ColorSpace_Linear);
}
}
}
signedArea1Row += signedArea1DeltaX;
signedArea2Row += signedArea2DeltaX;
signedArea3Row += signedArea3DeltaX;
}
signedArea1 += signedArea1DeltaY;
signedArea2 += signedArea2DeltaY;
signedArea3 += signedArea3DeltaY;
}
} }
DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise); DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
@ -881,12 +1046,10 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
if (signedAreaParallelogram == 0) return; if (signedAreaParallelogram == 0) return;
f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram; f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;
DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Scan and Render // Scan and Render
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
const u32 zBufferPitch = renderBuffer->width; const u32 zBufferPitch = renderBuffer->width;
const f32 BARYCENTRIC_EPSILON = 0.1f;
for (i32 bufferY = min.y; bufferY < max.y; bufferY++) for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
{ {
f32 signedArea1Row = signedArea1; f32 signedArea1Row = signedArea1;
@ -903,6 +1066,7 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
i32 zBufferIndex = bufferX + (bufferY * zBufferPitch); i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
f32 pixelZValue = a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z)); f32 pixelZValue = a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z));
f32 currZValue = renderBuffer->zBuffer[zBufferIndex]; f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height));
if (pixelZValue > currZValue) if (pixelZValue > currZValue)
{ {
renderBuffer->zBuffer[zBufferIndex] = pixelZValue; renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
@ -919,7 +1083,6 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
signedArea2 += signedArea2DeltaY; signedArea2 += signedArea2DeltaY;
signedArea3 += signedArea3DeltaY; signedArea3 += signedArea3DeltaY;
} }
DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Debug // Debug
@ -958,8 +1121,7 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
} }
} }
void DTRRender_Bitmap(DTRRenderBuffer *const renderBuffer, void DTRRender_Bitmap(DTRRenderBuffer *const renderBuffer, DTRBitmap *const bitmap, DqnV2 pos,
DTRBitmap *const bitmap, DqnV2 pos,
const DTRRenderTransform transform, DqnV4 color) const DTRRenderTransform transform, DqnV4 color)
{ {
if (!bitmap || !bitmap->memory || !renderBuffer) return; if (!bitmap || !bitmap->memory || !renderBuffer) return;

View File

@ -63,7 +63,7 @@ void DTRRender_Text (DTRRenderBuffer *const renderBuffer, const DTRFo
void DTRRender_Line (DTRRenderBuffer *const renderBuffer, DqnV2i a, DqnV2i b, DqnV4 color); void DTRRender_Line (DTRRenderBuffer *const renderBuffer, DqnV2i a, DqnV2i b, DqnV4 color);
void DTRRender_Rectangle (DTRRenderBuffer *const renderBuffer, DqnV2 min, DqnV2 max, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTransform()); void DTRRender_Rectangle (DTRRenderBuffer *const renderBuffer, DqnV2 min, DqnV2 max, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTransform());
void DTRRender_Triangle (DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform()); void DTRRender_Triangle (DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform());
void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform()); void DTRRender_TexturedTriangle(PlatformInput *const input, DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3, DqnV2 uv1, DqnV2 uv2, DqnV2 uv3, DTRBitmap *const texture, DqnV4 color, const DTRRenderTransform transform = DTRRender_DefaultTriangleTransform());
void DTRRender_Bitmap (DTRRenderBuffer *const renderBuffer, DTRBitmap *const bitmap, DqnV2 pos, const DTRRenderTransform transform = DTRRender_DefaultTransform(), DqnV4 color = DqnV4_4f(1, 1, 1, 1)); void DTRRender_Bitmap (DTRRenderBuffer *const renderBuffer, DTRBitmap *const bitmap, DqnV2 pos, const DTRRenderTransform transform = DTRRender_DefaultTransform(), DqnV4 color = DqnV4_4f(1, 1, 1, 1));
void DTRRender_Clear (DTRRenderBuffer *const renderBuffer, DqnV3 color); void DTRRender_Clear (DTRRenderBuffer *const renderBuffer, DqnV3 color);

View File

@ -39,7 +39,7 @@ REM wd4100 unused argument parameters
REM wd4201 nonstandard extension used: nameless struct/union REM wd4201 nonstandard extension used: nameless struct/union
REM wd4189 local variable is initialised but not referenced REM wd4189 local variable is initialised but not referenced
REM wd4505 unreferenced local function not used will be removed REM wd4505 unreferenced local function not used will be removed
set CompileFlags=-EHsc -GR- -Oi -MT -Z7 -W4 -wd4100 -wd4201 -wd4189 -wd4505 -Od -FAsc /I..\src\external\ set CompileFlags=-EHsc -GR- -Oi -MT -Z7 -W4 -wd4100 -wd4201 -wd4189 -wd4505 -O2 -FAsc /I..\src\external\
set DLLFlags=/Fm%ProjectName% /Fo%ProjectName% /Fa%ProjectName% /Fe%ProjectName% set DLLFlags=/Fm%ProjectName% /Fo%ProjectName% /Fa%ProjectName% /Fe%ProjectName%
set Win32Flags=/FmWin32DTRenderer /FeWin32DTRenderer set Win32Flags=/FmWin32DTRenderer /FeWin32DTRenderer
@ -62,7 +62,7 @@ REM ////////////////////////////////////////////////////////////////////////////
del *.pdb >NUL 2>NUL del *.pdb >NUL 2>NUL
cl %CompileFlags% %Win32Flags% ..\src\Win32DTRenderer.cpp /link %LinkLibraries% %LinkFlags% cl %CompileFlags% %Win32Flags% ..\src\Win32DTRenderer.cpp /link %LinkLibraries% %LinkFlags%
REM cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link ..\src\external\easy\easy_profiler.lib /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags% REM cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link ..\src\external\easy\easy_profiler.lib /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags%
cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags% cl %CompileFlags% %DLLFlags% ..\src\UnityBuild\UnityBuild.cpp /LD /link /PDB:%ProjectName%_%TimeStamp%.pdb /export:DTR_Update %LinkFlags%
popd popd
set LastError=%ERRORLEVEL% set LastError=%ERRORLEVEL%