Reorganise SIMD paths to reuse abit more code

This commit is contained in:
Doyle Thai 2017-05-31 20:57:11 +10:00
parent 47d606e297
commit 6bcdb6d1fb
3 changed files with 520 additions and 312 deletions

View File

@ -1041,15 +1041,18 @@ extern "C" void DTR_Update(PlatformRenderBuffer *const platformRenderBuffer,
DTRRender_Triangle(&renderBuffer, t4[0], t4[1], t4[2], colorRed);
DTRRender_Triangle(&renderBuffer, t5[0], t5[1], t5[2], colorRed);
////////////////////////////////////////////////////////////////////////
// Draw Loaded Model
////////////////////////////////////////////////////////////////////////
const DqnV3 LIGHT = DqnV3_3i(0, 0, -1);
const f32 MODEL_SCALE = DQN_MIN(renderBuffer.width, renderBuffer.height) * 0.5f;
DTRMesh *const mesh = &state->mesh;
DqnV3 modelP = DqnV3_3f(renderBuffer.width * 0.5f, renderBuffer.height * 0.5f, 0);
if (1)
{
////////////////////////////////////////////////////////////////////////
// Draw Loaded Model
////////////////////////////////////////////////////////////////////////
const DqnV3 LIGHT = DqnV3_3i(0, 0, -1);
const f32 MODEL_SCALE = DQN_MIN(renderBuffer.width, renderBuffer.height) * 0.5f;
DTRMesh *const mesh = &state->mesh;
DqnV3 modelP = DqnV3_3f(renderBuffer.width * 0.5f, renderBuffer.height * 0.5f, 0);
DTRRender_Mesh(&renderBuffer, mesh, modelP, MODEL_SCALE, LIGHT);
DTRRender_Mesh(&renderBuffer, mesh, modelP, MODEL_SCALE, LIGHT);
}
}
// Rect drawing

View File

@ -43,6 +43,7 @@ enum DTRDebugCounter
enum DTRDebugCycleCount
{
DTRDebugCycleCount_RenderTexturedTriangle_Rasterise,
DTRDebugCycleCount_RenderTriangle_Rasterise,
DTRDebugCycleCount_Count,
};

View File

@ -61,17 +61,20 @@ inline f32 DTRRender_SRGB1ToLinearSpacef(f32 val)
inline DqnV4 DTRRender_SRGB1ToLinearSpaceV4(DqnV4 color)
{
DqnV4 result;
#if 0
result.r = DTRRender_SRGB1ToLinearSpacef(color.r);
result.g = DTRRender_SRGB1ToLinearSpacef(color.g);
result.b = DTRRender_SRGB1ToLinearSpacef(color.b);
#else
__m128 simdColor = _mm_set_ps(color.r, color.g, color.b, 0);
__m128 simdResult = _mm_mul_ps(simdColor, simdColor);
result.r = ((f32 *)&simdResult)[3];
result.g = ((f32 *)&simdResult)[2];
result.b = ((f32 *)&simdResult)[1];
#endif
if (globalDTRPlatformFlags.canUseSSE2)
{
__m128 simdColor = _mm_set_ps(color.r, color.g, color.b, 0);
__m128 simdResult = _mm_mul_ps(simdColor, simdColor);
result.r = ((f32 *)&simdResult)[3];
result.g = ((f32 *)&simdResult)[2];
result.b = ((f32 *)&simdResult)[1];
}
else
{
result.r = DTRRender_SRGB1ToLinearSpacef(color.r);
result.g = DTRRender_SRGB1ToLinearSpacef(color.g);
result.b = DTRRender_SRGB1ToLinearSpacef(color.b);
}
result.a = color.a;
return result;
@ -88,17 +91,20 @@ inline f32 DTRRender_LinearToSRGB1Spacef(f32 val)
inline DqnV4 DTRRender_LinearToSRGB1SpaceV4(DqnV4 color)
{
DqnV4 result;
#if 0
result.r = DTRRender_LinearToSRGB1Spacef(color.r);
result.g = DTRRender_LinearToSRGB1Spacef(color.g);
result.b = DTRRender_LinearToSRGB1Spacef(color.b);
#else
__m128 simdColor = _mm_set_ps(color.r, color.g, color.b, 0);
__m128 simdResult = _mm_sqrt_ps(simdColor);
result.r = ((f32 *)&simdResult)[3];
result.g = ((f32 *)&simdResult)[2];
result.b = ((f32 *)&simdResult)[1];
#endif
if (globalDTRPlatformFlags.canUseSSE2)
{
__m128 simdColor = _mm_set_ps(color.r, color.g, color.b, 0);
__m128 simdResult = _mm_sqrt_ps(simdColor);
result.r = ((f32 *)&simdResult)[3];
result.g = ((f32 *)&simdResult)[2];
result.b = ((f32 *)&simdResult)[1];
}
else
{
result.r = DTRRender_LinearToSRGB1Spacef(color.r);
result.g = DTRRender_LinearToSRGB1Spacef(color.g);
result.b = DTRRender_LinearToSRGB1Spacef(color.b);
}
result.a = color.a;
return result;
@ -554,6 +560,221 @@ FILE_SCOPE void DebugBarycentricInternal(DqnV2 p, DqnV2 a, DqnV2 b, DqnV2 c, f32
*u = 1.0f - *v - *w;
}
typedef struct TriangleInclusionTest
{
DqnV2i boundsMin;
DqnV2i boundsMax;
f32 signedAreaP1;
f32 signedAreaP1DeltaX;
f32 signedAreaP1DeltaY;
f32 signedAreaP2;
f32 signedAreaP2DeltaX;
f32 signedAreaP2DeltaY;
f32 signedAreaP3;
f32 signedAreaP3DeltaX;
f32 signedAreaP3DeltaY;
f32 invSignedAreaParallelogram;
} TriangleInclusionTest;
typedef struct SIMDTriangleInclusionTest
{
DqnV2i boundsMin;
DqnV2i boundsMax;
__m128 vertexZValues;
__m128 signedAreaPixelDeltaX;
__m128 signedAreaPixelDeltaY;
__m128 invSignedAreaParallelogram_4x;
__m128 startPixel;
} SIMDTriangleInclusionTest;
FILE_SCOPE TriangleInclusionTest CreateTriangleInclusionTest(const i32 clipWidth,
const i32 clipHeight, DqnV3 p1,
DqnV3 p2, DqnV3 p3)
{
f32 area2Times = ((p2.x - p1.x) * (p2.y + p1.y)) + ((p3.x - p2.x) * (p3.y + p2.y)) +
((p1.x - p3.x) * (p1.y + p3.y));
if (area2Times > 0)
{
// Clockwise swap any point to make it clockwise
DQN_SWAP(DqnV3, p2, p3);
}
TriangleInclusionTest result = {};
result.boundsMin = DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x), DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y));
result.boundsMax = DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x), DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y));
result.boundsMin.x = DQN_MAX(result.boundsMin.x, 0);
result.boundsMin.y = DQN_MAX(result.boundsMin.y, 0);
result.boundsMax.x = DQN_MIN(result.boundsMax.x, clipWidth);
result.boundsMax.y = DQN_MIN(result.boundsMax.y, clipHeight);
const DqnV3 a = p1;
const DqnV3 b = p2;
const DqnV3 c = p3;
/*
/////////////////////////////////////////////////////////////////////////
// Rearranging the Determinant
/////////////////////////////////////////////////////////////////////////
Given two points that form a line and an extra point to test, we can
determine whether a point lies on the line, or is to the left or right of
a the line.
We can do this using the PerpDotProduct conceptually known as the cross
product in 2D. This can be expressed using the determinant and is the
method we are using.
First forming a 3x3 matrix of our terms with a, b being from the triangle
and test point c, we can derive a 2x2 matrix by subtracting the 1st
column from the 2nd and 1st column from the third.
| ax bx cx | | (bx - ax) (cx - ax) |
m = | ay by cy | ==> | (by - ay) (cy - ay) |
| 1 1 1 |
From our 2x2 representation we can calculate the determinant which gives
us the signed area of the triangle extended into a parallelogram.
det(m) = (bx - ax)(cy - ay) - (by - ay)(cx - ax)
Depending on the order of the vertices supplied, if it's
- CCW and c(x,y) is outside the line (triangle), the signed area is negative
- CCW and c(x,y) is inside the line (triangle), the signed area is positive
- CW and c(x,y) is outside the line (triangle), the signed area is positive
- CW and c(x,y) is inside the line (triangle), the signed area is negative
/////////////////////////////////////////////////////////////////////////
// Optimising the Determinant Calculation
/////////////////////////////////////////////////////////////////////////
The det(m) can be rearranged if expanded to be
SignedArea(cx, cy) = (ay - by)cx + (bx - ay)cy + (ax*by - ay*bx)
When we scan to fill our triangle we go pixel by pixel, left to right,
bottom to top, notice that this translates to +1 for x and +1 for y, i.e.
The first pixel's signed area is cx, then cx+1, cx+2 .. etc
SignedArea(cx, cy) = (ay - by)cx + (bx - ax)cy + (ax*by - ay*bx)
SignedArea(cx+1, cy) = (ay - by)cx+1 + (bx - ax)cy + (ax*by - ay*bx)
Then
SignedArea(cx+1, cy) - SignedArea(cx, cy) =
(ay - by)cx+1 + (bx - ax)cy + (ax*by - ay*bx)
- (ay - by)cx + (bx - ax)cy + (ax*by - ay*bx)
= (ay - by)cx+1 - (ay - by)cx
= (ay - by)(cx+1 - cx)
= (ay - by)(1) = (ay - by)
Similarly when progressing in y
SignedArea(cx, cy) = (ay - by)cx + (bx - ay)cy + (ax*by - ay*bx)
SignedArea(cx, cy+1) = (ay - by)cx + (bx - ay)cy+1 + (ax*by - ay*bx)
Then
SignedArea(cx, cy+1) - SignedArea(cx, cy) =
(ay - by)cx + (bx - ax)cy+1 + (ax*by - ay*bx)
- (ay - by)cx + (bx - ax)cy + (ax*by - ay*bx)
= (bx - ax)cy+1 - (bx - ax)cy
= (bx - ax)(cy+1 - cy)
= (bx - ax)(1) = (bx - ax)
Then we can see that when we progress along x, we only need to change by
the value of SignedArea by (ay - by) and similarly for y, (bx - ax)
/////////////////////////////////////////////////////////////////////////
// Barycentric Coordinates
/////////////////////////////////////////////////////////////////////////
At this point we have an equation that can be used to calculate the
2x the signed area of a triangle, or the signed area of a parallelogram,
the two of which are equivalent.
det(m) = (bx - ax)(cy - ay) - (by - ay)(cx - ax)
SignedArea(cx, cy) = (ay - by)cx + (bx - ay)cy + (ax*by - ay*bx)
A barycentric coordinate is some coefficient on A, B, C that allows us to
specify an arbitrary point in the triangle as a linear combination of the
three usually with some coefficient [0, 1].
The SignedArea turns out to be actually the barycentric coord for c(x, y)
normalised to the sum of the parallelogram area. For example a triangle
with points, A, B, C and an arbitrary point P inside the triangle. Then
SignedArea(P) with vertex A and B = Barycentric Coordinate for C
SignedArea(P) with vertex B and C = Barycentric Coordinate for A
SignedArea(P) with vertex C and A = Barycentric Coordinate for B
B
/ \
/ \
/ P \
/_______\
A C
This is normalised to the area's sum, but we can trivially turn this into
a normalised version by dividing the area of the parallelogram, i.e.
BaryCentricC(P) = (SignedArea(P) with vertex A and B)/SignedArea(with the orig triangle vertex)
BaryCentricA(P) = (SignedArea(P) with vertex B and C)/SignedArea(with the orig triangle vertex)
BaryCentricB(P) = (SignedArea(P) with vertex C and A)/SignedArea(with the orig triangle vertex)
*/
DqnV2i startP = result.boundsMin;
// signed area for a, where P1 = A, P2 = B, P3 = C
result.signedAreaP1 = ((p3.x - p2.x) * (startP.y - p2.y)) - ((p3.y - p2.y) * (startP.x - p2.x));
result.signedAreaP1DeltaX = p2.y - p3.y;
result.signedAreaP1DeltaY = p3.x - p2.x;
// signed area for b
result.signedAreaP2 = ((p1.x - p3.x) * (startP.y - p3.y)) - ((p1.y - p3.y) * (startP.x - p3.x));
result.signedAreaP2DeltaX = p3.y - p1.y;
result.signedAreaP2DeltaY = p1.x - p3.x;
// signed area for c
result.signedAreaP3 = ((p2.x - p1.x) * (startP.y - p1.y)) - ((p2.y - p1.y) * (startP.x - p1.x));
result.signedAreaP3DeltaX = p1.y - p2.y;
result.signedAreaP3DeltaY = p2.x - p1.x;
f32 signedAreaParallelogram = result.signedAreaP1 + result.signedAreaP2 + result.signedAreaP3;
if (signedAreaParallelogram == 0)
result.invSignedAreaParallelogram = 0;
else
result.invSignedAreaParallelogram = (1.0f / signedAreaParallelogram);
return result;
}
FILE_SCOPE SIMDTriangleInclusionTest CreateSimdTriangleInclusionTest(
const DqnV3 p1, const DqnV3 p2, const DqnV3 p3, const TriangleInclusionTest inclusionTest)
{
SIMDTriangleInclusionTest result = {};
result.boundsMin = inclusionTest.boundsMin;
result.boundsMax = inclusionTest.boundsMax;
// NOTE: Order is important here!
result.vertexZValues = _mm_set_ps(0, p3.z, p2.z, p1.z);
result.signedAreaPixelDeltaX = _mm_set_ps(0,
inclusionTest.signedAreaP3DeltaX,
inclusionTest.signedAreaP2DeltaX,
inclusionTest.signedAreaP1DeltaX);
result.signedAreaPixelDeltaY = _mm_set_ps(0,
inclusionTest.signedAreaP3DeltaY,
inclusionTest.signedAreaP2DeltaY,
inclusionTest.signedAreaP1DeltaY);
result.invSignedAreaParallelogram_4x = _mm_set_ps1(inclusionTest.invSignedAreaParallelogram);
result.startPixel = _mm_set_ps(0, inclusionTest.signedAreaP3,
inclusionTest.signedAreaP2,
inclusionTest.signedAreaP1);
return result;
}
inline void RasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, const DqnV3 p1,
const DqnV3 p2, const DqnV3 p3, const DqnV2 uv1,
const DqnV2 uv2, const DqnV2 uv3, DTRBitmap *const texture,
@ -739,153 +960,150 @@ inline void SIMDRasteriseTexturedTriangle(DTRRenderBuffer *const renderBuffer, c
const __m128 INV255_4X = _mm_set_ps1(1.0f / 255.0f);
const __m128 ZERO_4X = _mm_set_ps1(0.0f);
const __m128 TWO_4X = _mm_set_ps1(2.0f);
const u32 IS_GREATER_MASK = 0xF;
// NOTE: Step size of 2 pixels across X
signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, TWO_4X);
const u32 NUM_X_PIXELS_TO_SIMD = 2;
const u32 NUM_Y_PIXELS_TO_SIMD = 1;
const __m128 STEP_X_4X = _mm_set_ps1((f32)NUM_X_PIXELS_TO_SIMD);
const __m128 STEP_Y_4X = _mm_set_ps1((f32)NUM_Y_PIXELS_TO_SIMD);
// NOTE: Increase step size to the number of pixels rasterised with SIMD
signedAreaPixelDeltaX = _mm_mul_ps(signedAreaPixelDeltaX, STEP_X_4X);
signedAreaPixelDeltaY = _mm_mul_ps(signedAreaPixelDeltaY, STEP_Y_4X);
const DqnV2 uv2SubUv1 = uv2 - uv1;
const DqnV2 uv3SubUv1 = uv3 - uv1;
const __m128 colorModulate = _mm_set_ps(color.a, color.b, color.g, color.r);
const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
const u8 *const texturePtr = texture->memory;
for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD)
{
__m128 signedArea1 = signedAreaPixel1;
__m128 signedArea2 = signedAreaPixel2;
#define PROCESS_COLOR_NO_SIMD 0
for (i32 bufferX = min.x; bufferX < max.x; bufferX += 2)
for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD)
{
__m128 isGreater1 = _mm_cmpge_ps(signedArea1, ZERO_4X);
i32 isGreaterResult1 = _mm_movemask_ps(isGreater1);
if ((isGreaterResult1 & IS_GREATER_MASK) == IS_GREATER_MASK)
// Rasterise buffer(X, Y) pixel
{
__m128 barycentric = _mm_mul_ps(signedArea1, invSignedAreaParallelogram_4x);
__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
__m128 checkArea = signedArea1;
__m128 isGreater = _mm_cmpge_ps(checkArea, ZERO_4X);
i32 isGreaterResult = _mm_movemask_ps(isGreater);
i32 posX = bufferX;
i32 posY = bufferY;
i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
f32 pixelZValue = ((f32 *)&barycentricZ)[0] +
((f32 *)&barycentricZ)[1] +
((f32 *)&barycentricZ)[2];
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
if (pixelZValue > currZValue)
if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK && posX < max.x)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
u8 *texturePtr = texture->memory;
const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
__m128 barycentric = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x);
__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
f32 barycentricB = ((f32 *)&barycentric)[2];
f32 barycentricC = ((f32 *)&barycentric)[0];
DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
i32 zBufferIndex = posX + (posY * zBufferPitch);
f32 pixelZValue = ((f32 *)&barycentricZ)[0] +
((f32 *)&barycentricZ)[1] +
((f32 *)&barycentricZ)[2];
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
if (pixelZValue > currZValue)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
const f32 EPSILON = 0.1f;
DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
f32 barycentricB = ((f32 *)&barycentric)[2];
f32 barycentricC = ((f32 *)&barycentric)[0];
DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
f32 texelXf = uv.x * texture->dim.w;
f32 texelYf = uv.y * texture->dim.h;
DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
const f32 EPSILON = 0.1f;
DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
i32 texelX = (i32)texelXf;
i32 texelY = (i32)texelYf;
f32 texelXf = uv.x * texture->dim.w;
f32 texelYf = uv.y * texture->dim.h;
DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
(texelY * texturePitch));
i32 texelX = (i32)texelXf;
i32 texelY = (i32)texelYf;
#if PROCESS_COLOR_NO_SIMD
DqnV4 color1;
color1.a = (f32)(texel1 >> 24);
color1.b = (f32)((texel1 >> 16) & 0xFF);
color1.g = (f32)((texel1 >> 8) & 0xFF);
color1.r = (f32)((texel1 >> 0) & 0xFF);
color1 *= DTRRENDER_INV_255;
color1 = DTRRender_SRGB1ToLinearSpaceV4(color1);
DqnV4 blend = color * color1;
#else
__m128 color1 =
_mm_set_ps((f32)(texel1 >> 24), (f32)((texel1 >> 16) & 0xFF),
(f32)((texel1 >> 8) & 0xFF), (f32)((texel1 >> 0) & 0xFF));
color1 = _mm_mul_ps(color1, INV255_4X);
color1 = _mm_mul_ps(color1, color1); // to linear space
color1 = _mm_mul_ps(color1, colorModulate);
u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + (texelY * texturePitch));
DqnV4 blend = {};
blend.r = ((f32 *)&color1)[0];
blend.g = ((f32 *)&color1)[1];
blend.b = ((f32 *)&color1)[2];
blend.a = ((f32 *)&color1)[3];
#endif
SetPixel(renderBuffer, bufferX, bufferY, blend, ColorSpace_Linear);
__m128 color1 = _mm_set_ps((f32)(texel1 >> 24),
(f32)((texel1 >> 16) & 0xFF),
(f32)((texel1 >> 8) & 0xFF),
(f32)((texel1 >> 0) & 0xFF));
color1 = _mm_mul_ps(color1, INV255_4X);
f32 preserveAlpha = ((f32 *)&color1)[3];
color1 = _mm_mul_ps(color1, color1); // to linear space
((f32 *)&color1)[3] = preserveAlpha;
color1 = _mm_mul_ps(color1, colorModulate);
DqnV4 blend = {};
blend.r = ((f32 *)&color1)[0];
blend.g = ((f32 *)&color1)[1];
blend.b = ((f32 *)&color1)[2];
blend.a = ((f32 *)&color1)[3];
SetPixel(renderBuffer, posX, posY, blend, ColorSpace_Linear);
}
}
}
__m128 isGreater2 = _mm_cmpge_ps(signedArea2, ZERO_4X);
i32 isGreaterResult2 = _mm_movemask_ps(isGreater2);
i32 bufferX1 = bufferX + 1;
if ((isGreaterResult2 & IS_GREATER_MASK) == IS_GREATER_MASK && bufferX1 < max.x)
// Rasterise buffer(X + 1, Y) pixel
{
__m128 barycentric = _mm_mul_ps(signedArea2, invSignedAreaParallelogram_4x);
__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
i32 zBufferIndex = bufferX1 + (bufferY * zBufferPitch);
f32 pixelZValue = ((f32 *)&barycentricZ)[0] + ((f32 *)&barycentricZ)[1] +
((f32 *)&barycentricZ)[2];
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
if (pixelZValue > currZValue)
__m128 checkArea = signedArea2;
__m128 isGreater = _mm_cmpge_ps(checkArea, ZERO_4X);
i32 isGreaterResult = _mm_movemask_ps(isGreater);
i32 posX = bufferX + 1;
i32 posY = bufferY;
if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK && posX < max.x)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
u8 *texturePtr = texture->memory;
const u32 texturePitch = texture->bytesPerPixel * texture->dim.w;
__m128 barycentric = _mm_mul_ps(checkArea, invSignedAreaParallelogram_4x);
__m128 barycentricZ = _mm_mul_ps(triangleZ, barycentric);
f32 barycentricB = ((f32 *)&barycentric)[2];
f32 barycentricC = ((f32 *)&barycentric)[0];
DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
i32 zBufferIndex = posX + (posY * zBufferPitch);
f32 pixelZValue = ((f32 *)&barycentricZ)[0] +
((f32 *)&barycentricZ)[1] +
((f32 *)&barycentricZ)[2];
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
if (pixelZValue > currZValue)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
const f32 EPSILON = 0.1f;
DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
f32 barycentricB = ((f32 *)&barycentric)[2];
f32 barycentricC = ((f32 *)&barycentric)[0];
DqnV2 uv = uv1 + (uv2SubUv1 * barycentricB) + (uv3SubUv1 * barycentricC);
f32 texelXf = uv.x * texture->dim.w;
f32 texelYf = uv.y * texture->dim.h;
DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
const f32 EPSILON = 0.1f;
DQN_ASSERT(uv.x >= 0 && uv.x < 1.0f + EPSILON);
DQN_ASSERT(uv.y >= 0 && uv.y < 1.0f + EPSILON);
uv.x = DqnMath_Clampf(uv.x, 0.0f, 1.0f);
uv.y = DqnMath_Clampf(uv.y, 0.0f, 1.0f);
i32 texelX = (i32)texelXf;
i32 texelY = (i32)texelYf;
f32 texelXf = uv.x * texture->dim.w;
f32 texelYf = uv.y * texture->dim.h;
DQN_ASSERT(texelXf >= 0 && texelXf < texture->dim.w);
DQN_ASSERT(texelYf >= 0 && texelYf < texture->dim.h);
u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) +
(texelY * texturePitch));
i32 texelX = (i32)texelXf;
i32 texelY = (i32)texelYf;
#if PROCESS_COLOR_NO_SIMD
DqnV4 color1;
color1.a = (f32)(texel1 >> 24);
color1.b = (f32)((texel1 >> 16) & 0xFF);
color1.g = (f32)((texel1 >> 8) & 0xFF);
color1.r = (f32)((texel1 >> 0) & 0xFF);
color1 *= DTRRENDER_INV_255;
color1 = DTRRender_SRGB1ToLinearSpaceV4(color1);
DqnV4 blend = color * color1;
#else
__m128 color1 =
_mm_set_ps((f32)(texel1 >> 24), (f32)((texel1 >> 16) & 0xFF),
(f32)((texel1 >> 8) & 0xFF), (f32)((texel1 >> 0) & 0xFF));
color1 = _mm_mul_ps(color1, INV255_4X);
color1 = _mm_mul_ps(color1, color1); // to linear space
color1 = _mm_mul_ps(color1, colorModulate);
u32 texel1 = *(u32 *)(texturePtr + (texelX * texture->bytesPerPixel) + (texelY * texturePitch));
DqnV4 blend = {};
blend.r = ((f32 *)&color1)[0];
blend.g = ((f32 *)&color1)[1];
blend.b = ((f32 *)&color1)[2];
blend.a = ((f32 *)&color1)[3];
#endif
SetPixel(renderBuffer, bufferX1, bufferY, blend, ColorSpace_Linear);
__m128 color1 = _mm_set_ps((f32)(texel1 >> 24),
(f32)((texel1 >> 16) & 0xFF),
(f32)((texel1 >> 8) & 0xFF),
(f32)((texel1 >> 0) & 0xFF));
color1 = _mm_mul_ps(color1, INV255_4X);
f32 preserveAlpha = ((f32 *)&color1)[3];
color1 = _mm_mul_ps(color1, color1); // to linear space
((f32 *)&color1)[3] = preserveAlpha;
color1 = _mm_mul_ps(color1, colorModulate);
DqnV4 blend = {};
blend.r = ((f32 *)&color1)[0];
blend.g = ((f32 *)&color1)[1];
blend.b = ((f32 *)&color1)[2];
blend.a = ((f32 *)&color1)[3];
SetPixel(renderBuffer, posX, posY, blend, ColorSpace_Linear);
}
}
}
@ -931,7 +1149,7 @@ void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, D
DQN_SWAP(DqnV3, p2, p3);
}
DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTexturedTriangle_Rasterise);
////////////////////////////////////////////////////////////////////////////
// Scan and Render
////////////////////////////////////////////////////////////////////////////
@ -943,7 +1161,7 @@ void DTRRender_TexturedTriangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, D
{
RasteriseTexturedTriangle(renderBuffer, p1, p2, p3, uv1, uv2, uv3, texture, color);
}
DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTexturedTriangle_Rasterise);
////////////////////////////////////////////////////////////////////////////
// Debug
@ -1075,6 +1293,35 @@ void DTRRender_Mesh(DTRRenderBuffer *const renderBuffer, DTRMesh *const mesh, co
}
}
FILE_SCOPE inline void SIMDRasteriseTriangle(DTRRenderBuffer *const renderBuffer,
const SIMDTriangleInclusionTest simdTri,
const i32 posX, const i32 posY, const DqnV4 color,
__m128 *const signedArea)
{
__m128 ZERO_4X = _mm_set_ps1(0.0f);
u32 IS_GREATER_MASK = 0xF;
const u32 zBufferPitch = renderBuffer->width;
__m128 isGreater = _mm_cmpge_ps(*signedArea, ZERO_4X);
i32 isGreaterResult = _mm_movemask_ps(isGreater);
if ((isGreaterResult & IS_GREATER_MASK) == IS_GREATER_MASK)
{
__m128 barycentric = _mm_mul_ps(*signedArea, simdTri.invSignedAreaParallelogram_4x);
__m128 barycentricZ = _mm_mul_ps(simdTri.vertexZValues, barycentric);
i32 zBufferIndex = posX + (posY * zBufferPitch);
f32 pixelZValue =
((f32 *)&barycentricZ)[0] + ((f32 *)&barycentricZ)[1] + ((f32 *)&barycentricZ)[2];
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
if (pixelZValue > currZValue)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
SetPixel(renderBuffer, posX, posY, color, ColorSpace_Linear);
}
}
*signedArea = _mm_add_ps(*signedArea, simdTri.signedAreaPixelDeltaX);
}
void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2, DqnV3 p3,
DqnV4 color, const DTRRenderTransform transform)
{
@ -1099,189 +1346,137 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
color = DTRRender_SRGB1ToLinearSpaceV4(color);
color = PreMultiplyAlpha1(color);
////////////////////////////////////////////////////////////////////////////
// Calculate Bounding Box
////////////////////////////////////////////////////////////////////////////
DqnV2i max = DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x),
DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y));
DqnV2i min = DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x),
DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y));
min.x = DQN_MAX(min.x, 0);
min.y = DQN_MAX(min.y, 0);
max.x = DQN_MIN(max.x, renderBuffer->width - 1);
max.y = DQN_MIN(max.y, renderBuffer->height - 1);
/*
/////////////////////////////////////////////////////////////////////////
// Rearranging the Determinant
/////////////////////////////////////////////////////////////////////////
Given two points that form a line and an extra point to test, we can
determine whether a point lies on the line, or is to the left or right of
a the line.
We can do this using the PerpDotProduct conceptually known as the cross
product in 2D. This can be expressed using the determinant and is the
method we are using.
First forming a 3x3 matrix of our terms with a, b being from the triangle
and test point c, we can derive a 2x2 matrix by subtracting the 1st
column from the 2nd and 1st column from the third.
| ax bx cx | | (bx - ax) (cx - ax) |
m = | ay by cy | ==> | (by - ay) (cy - ay) |
| 1 1 1 |
From our 2x2 representation we can calculate the determinant which gives
us the signed area of the triangle extended into a parallelogram.
det(m) = (bx - ax)(cy - ay) - (by - ay)(cx - ax)
Depending on the order of the vertices supplied, if it's
- CCW and c(x,y) is outside the line (triangle), the signed area is negative
- CCW and c(x,y) is inside the line (triangle), the signed area is positive
- CW and c(x,y) is outside the line (triangle), the signed area is positive
- CW and c(x,y) is inside the line (triangle), the signed area is negative
/////////////////////////////////////////////////////////////////////////
// Optimising the Determinant Calculation
/////////////////////////////////////////////////////////////////////////
The det(m) can be rearranged if expanded to be
SignedArea(cx, cy) = (ay - by)cx + (bx - ay)cy + (ax*by - ay*bx)
When we scan to fill our triangle we go pixel by pixel, left to right,
bottom to top, notice that this translates to +1 for x and +1 for y, i.e.
The first pixel's signed area is cx, then cx+1, cx+2 .. etc
SignedArea(cx, cy) = (ay - by)cx + (bx - ax)cy + (ax*by - ay*bx)
SignedArea(cx+1, cy) = (ay - by)cx+1 + (bx - ax)cy + (ax*by - ay*bx)
Then
SignedArea(cx+1, cy) - SignedArea(cx, cy) =
(ay - by)cx+1 + (bx - ax)cy + (ax*by - ay*bx)
- (ay - by)cx + (bx - ax)cy + (ax*by - ay*bx)
= (ay - by)cx+1 - (ay - by)cx
= (ay - by)(cx+1 - cx)
= (ay - by)(1) = (ay - by)
Similarly when progressing in y
SignedArea(cx, cy) = (ay - by)cx + (bx - ay)cy + (ax*by - ay*bx)
SignedArea(cx, cy+1) = (ay - by)cx + (bx - ay)cy+1 + (ax*by - ay*bx)
Then
SignedArea(cx, cy+1) - SignedArea(cx, cy) =
(ay - by)cx + (bx - ax)cy+1 + (ax*by - ay*bx)
- (ay - by)cx + (bx - ax)cy + (ax*by - ay*bx)
= (bx - ax)cy+1 - (bx - ax)cy
= (bx - ax)(cy+1 - cy)
= (bx - ax)(1) = (bx - ax)
Then we can see that when we progress along x, we only need to change by
the value of SignedArea by (ay - by) and similarly for y, (bx - ax)
/////////////////////////////////////////////////////////////////////////
// Barycentric Coordinates
/////////////////////////////////////////////////////////////////////////
At this point we have an equation that can be used to calculate the
2x the signed area of a triangle, or the signed area of a parallelogram,
the two of which are equivalent.
det(m) = (bx - ax)(cy - ay) - (by - ay)(cx - ax)
SignedArea(cx, cy) = (ay - by)cx + (bx - ay)cy + (ax*by - ay*bx)
A barycentric coordinate is some coefficient on A, B, C that allows us to
specify an arbitrary point in the triangle as a linear combination of the
three usually with some coefficient [0, 1].
The SignedArea turns out to be actually the barycentric coord for c(x, y)
normalised to the sum of the parallelogram area. For example a triangle
with points, A, B, C and an arbitrary point P inside the triangle. Then
SignedArea(P) with vertex A and B = Barycentric Coordinate for C
SignedArea(P) with vertex B and C = Barycentric Coordinate for A
SignedArea(P) with vertex C and A = Barycentric Coordinate for B
B
/ \
/ \
/ P \
/_______\
A C
This is normalised to the area's sum, but we can trivially turn this into
a normalised version by dividing the area of the parallelogram, i.e.
BaryCentricC(P) = (SignedArea(P) with vertex A and B)/SignedArea(with the orig triangle vertex)
BaryCentricA(P) = (SignedArea(P) with vertex B and C)/SignedArea(with the orig triangle vertex)
BaryCentricB(P) = (SignedArea(P) with vertex C and A)/SignedArea(with the orig triangle vertex)
*/
f32 area2Times = ((p2.x - p1.x) * (p2.y + p1.y)) +
((p3.x - p2.x) * (p3.y + p2.y)) +
((p1.x - p3.x) * (p1.y + p3.y));
if (area2Times > 0)
{
// Clockwise swap any point to make it clockwise
DQN_SWAP(DqnV3, p2, p3);
}
const DqnV3 a = p1;
const DqnV3 b = p2;
const DqnV3 c = p3;
DqnV2i startP = min;
f32 signedArea1 = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
f32 signedArea1DeltaX = a.y - b.y;
f32 signedArea1DeltaY = b.x - a.x;
f32 signedArea2 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
f32 signedArea2DeltaX = b.y - c.y;
f32 signedArea2DeltaY = c.x - b.x;
f32 signedArea3 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
f32 signedArea3DeltaX = c.y - a.y;
f32 signedArea3DeltaY = a.x - c.x;
f32 signedAreaParallelogram = ((b.x - a.x) * (c.y - a.y) - (b.y - a.y) * (c.x - a.x));
if (signedAreaParallelogram == 0) return;
f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;
////////////////////////////////////////////////////////////////////////////
// Scan and Render
////////////////////////////////////////////////////////////////////////////
DTRDebug_BeginCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
const u32 zBufferPitch = renderBuffer->width;
for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
if (globalDTRPlatformFlags.canUseSSE2)
{
f32 signedArea1Row = signedArea1;
f32 signedArea2Row = signedArea2;
f32 signedArea3Row = signedArea3;
TriangleInclusionTest inclusionTest = CreateTriangleInclusionTest(
renderBuffer->width - 1, renderBuffer->height - 1, p1, p2, p3);
if (inclusionTest.invSignedAreaParallelogram == 0) return;
for (i32 bufferX = min.x; bufferX < max.x; bufferX++)
SIMDTriangleInclusionTest simdTri =
CreateSimdTriangleInclusionTest(p1, p2, p3, inclusionTest);
__m128 INV255_4X = _mm_set_ps1(1.0f / 255.0f);
__m128 ZERO_4X = _mm_set_ps1(0.0f);
u32 IS_GREATER_MASK = 0xF;
__m128 signedAreaPixel1 = simdTri.startPixel;
__m128 signedAreaPixel2 = _mm_add_ps(signedAreaPixel1, simdTri.signedAreaPixelDeltaX);
__m128 signedAreaPixel3 = _mm_add_ps(signedAreaPixel2, simdTri.signedAreaPixelDeltaX);
__m128 signedAreaPixel4 = _mm_add_ps(signedAreaPixel3, simdTri.signedAreaPixelDeltaX);
// NOTE: Increase step size to the number of pixels rasterised with SIMD
const u32 NUM_X_PIXELS_TO_SIMD = 2;
const u32 NUM_Y_PIXELS_TO_SIMD = 1;
const __m128 STEP_X_4X = _mm_set_ps1((f32)NUM_X_PIXELS_TO_SIMD);
const __m128 STEP_Y_4X = _mm_set_ps1((f32)NUM_Y_PIXELS_TO_SIMD);
simdTri.signedAreaPixelDeltaX = _mm_mul_ps(simdTri.signedAreaPixelDeltaX, STEP_X_4X);
simdTri.signedAreaPixelDeltaY = _mm_mul_ps(simdTri.signedAreaPixelDeltaY, STEP_Y_4X);
const DqnV2i min = inclusionTest.boundsMin;
const DqnV2i max = inclusionTest.boundsMax;
for (i32 bufferY = min.y; bufferY < max.y; bufferY += NUM_Y_PIXELS_TO_SIMD)
{
if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0)
{
f32 barycentricB = signedArea3Row * invSignedAreaParallelogram;
f32 barycentricC = signedArea1Row * invSignedAreaParallelogram;
__m128 signedArea1 = signedAreaPixel1;
__m128 signedArea2 = signedAreaPixel2;
i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
f32 pixelZValue = a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z));
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height));
if (pixelZValue > currZValue)
for (i32 bufferX = min.x; bufferX < max.x; bufferX += NUM_X_PIXELS_TO_SIMD)
{
SIMDRasteriseTriangle(renderBuffer, simdTri, bufferX, bufferY, color, &signedArea1);
if (bufferX + 1 < max.x)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
SetPixel(renderBuffer, bufferX, bufferY, color, ColorSpace_Linear);
SIMDRasteriseTriangle(renderBuffer, simdTri, bufferX + 1, bufferY, color,
&signedArea2);
}
}
signedArea1Row += signedArea1DeltaX;
signedArea2Row += signedArea2DeltaX;
signedArea3Row += signedArea3DeltaX;
signedAreaPixel1 = _mm_add_ps(signedAreaPixel1, simdTri.signedAreaPixelDeltaY);
signedAreaPixel2 = _mm_add_ps(signedAreaPixel2, simdTri.signedAreaPixelDeltaY);
}
}
else
{
f32 area2Times = ((p2.x - p1.x) * (p2.y + p1.y)) + ((p3.x - p2.x) * (p3.y + p2.y)) +
((p1.x - p3.x) * (p1.y + p3.y));
if (area2Times > 0)
{
// Clockwise swap any point to make it clockwise
DQN_SWAP(DqnV3, p2, p3);
}
signedArea1 += signedArea1DeltaY;
signedArea2 += signedArea2DeltaY;
signedArea3 += signedArea3DeltaY;
DqnV2i max =
DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x), DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y));
DqnV2i min =
DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x), DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y));
min.x = DQN_MAX(min.x, 0);
min.y = DQN_MAX(min.y, 0);
max.x = DQN_MIN(max.x, renderBuffer->width - 1);
max.y = DQN_MIN(max.y, renderBuffer->height - 1);
const DqnV3 a = p1;
const DqnV3 b = p2;
const DqnV3 c = p3;
DqnV2i startP = min;
f32 signedArea1 = ((b.x - a.x) * (startP.y - a.y)) - ((b.y - a.y) * (startP.x - a.x));
f32 signedArea1DeltaX = a.y - b.y;
f32 signedArea1DeltaY = b.x - a.x;
f32 signedArea2 = ((c.x - b.x) * (startP.y - b.y)) - ((c.y - b.y) * (startP.x - b.x));
f32 signedArea2DeltaX = b.y - c.y;
f32 signedArea2DeltaY = c.x - b.x;
f32 signedArea3 = ((a.x - c.x) * (startP.y - c.y)) - ((a.y - c.y) * (startP.x - c.x));
f32 signedArea3DeltaX = c.y - a.y;
f32 signedArea3DeltaY = a.x - c.x;
f32 signedAreaParallelogram = ((b.x - a.x) * (c.y - a.y) - (b.y - a.y) * (c.x - a.x));
if (signedAreaParallelogram == 0) return;
f32 invSignedAreaParallelogram = 1 / signedAreaParallelogram;
for (i32 bufferY = min.y; bufferY < max.y; bufferY++)
{
f32 signedArea1Row = signedArea1;
f32 signedArea2Row = signedArea2;
f32 signedArea3Row = signedArea3;
for (i32 bufferX = min.x; bufferX < max.x; bufferX++)
{
if (signedArea1Row >= 0 && signedArea2Row >= 0 && signedArea3Row >= 0)
{
f32 barycentricB = signedArea3Row * invSignedAreaParallelogram;
f32 barycentricC = signedArea1Row * invSignedAreaParallelogram;
i32 zBufferIndex = bufferX + (bufferY * zBufferPitch);
f32 pixelZValue =
a.z + (barycentricB * (b.z - a.z)) + (barycentricC * (c.z - a.z));
f32 currZValue = renderBuffer->zBuffer[zBufferIndex];
DQN_ASSERT(zBufferIndex < (renderBuffer->width * renderBuffer->height));
if (pixelZValue > currZValue)
{
renderBuffer->zBuffer[zBufferIndex] = pixelZValue;
SetPixel(renderBuffer, bufferX, bufferY, color, ColorSpace_Linear);
}
}
signedArea1Row += signedArea1DeltaX;
signedArea2Row += signedArea2DeltaX;
signedArea3Row += signedArea3DeltaX;
}
signedArea1 += signedArea1DeltaY;
signedArea2 += signedArea2DeltaY;
signedArea3 += signedArea3DeltaY;
}
}
DTRDebug_EndCycleCount(DTRDebugCycleCount_RenderTriangle_Rasterise);
////////////////////////////////////////////////////////////////////////////
// Debug
@ -1289,6 +1484,15 @@ void DTRRender_Triangle(DTRRenderBuffer *const renderBuffer, DqnV3 p1, DqnV3 p2,
DTRDebug_CounterIncrement(DTRDebugCounter_RenderTriangle);
if (DTR_DEBUG_RENDER)
{
DqnV2i max =
DqnV2i_2f(DQN_MAX(DQN_MAX(p1.x, p2.x), p3.x), DQN_MAX(DQN_MAX(p1.y, p2.y), p3.y));
DqnV2i min =
DqnV2i_2f(DQN_MIN(DQN_MIN(p1.x, p2.x), p3.x), DQN_MIN(DQN_MIN(p1.y, p2.y), p3.y));
min.x = DQN_MAX(min.x, 0);
min.y = DQN_MAX(min.y, 0);
max.x = DQN_MIN(max.x, renderBuffer->width - 1);
max.y = DQN_MIN(max.y, renderBuffer->height - 1);
// Draw Bounding box
{
DTRRender_Line(renderBuffer, DqnV2i_2i(min.x, min.y), DqnV2i_2i(min.x, max.y), color);