// Trace one packet of four rays (one ray per SSE lane) through the voxel grid.
// The whole block is skipped when every lane is already flagged in finishMsk.
// Outline:
//   1. load and transpose ray origins/directions into per-component registers,
//   2. fetch the starting voxel and set up per-axis stepping values,
//   3. loop: test the primitives of the current voxel, then step lanes forward
//      until each one reaches a voxel with content or leaves the grid.
// Helpers rcp_ps, floorPositive, fetch3DiVector, select and VecIntersects, the
// constants ZERO/ONE/Neg_ONE/NEG_SIGN/VEC_EPSILON/VEC_GRIDSZ/GRID_SIZE and the
// type __m128_ms are defined outside this block; notes on them are assumptions.
if (_mm_movemask_ps(finishMsk) != 0xf)
{
// Only ever incremented in this block (x: primitive-loop passes, y: voxel-step
// passes, z: outer-loop passes) and never read here — presumably profiling/debug.
Vector4 flowCounter(0, 0, 0, 0);
// NOTE(review): the casts assume each rayOrgs/rayDirs element is a 16-byte,
// 16-byte-aligned 4-float vector — confirm against the array declarations.
__m128* prayOrgs = (__m128*)&rayOrgs[i];
__m128* prayDirs = (__m128*)&rayDirs[i];
// Load four origins: entries i and i+1, then the two entries winWidth further on
// (presumably a 2x2 pixel block — confirm). After the transpose each register
// holds a single component for all four rays.
__m128 rayOrgX = *prayOrgs;
__m128 rayOrgY = *(prayOrgs + 1); prayOrgs += winWidth;
__m128 rayOrgZ = *prayOrgs;
__m128 rayOrgW = *(prayOrgs + 1);
_MM_TRANSPOSE4_PS(rayOrgX, rayOrgY, rayOrgZ, rayOrgW);
// Same load/transpose pattern for the ray directions.
__m128 rayDirX = *prayDirs;
__m128 rayDirY = *(prayDirs + 1); prayDirs += winWidth;
__m128 rayDirZ = *prayDirs;
__m128 rayDirW = *(prayDirs + 1);
_MM_TRANSPOSE4_PS(rayDirX, rayDirY, rayDirZ, rayDirW);
// Per-axis 1/direction via rcp_ps (presumably a reciprocal helper; precision unknown here).
__m128 rcpDirX = rcp_ps(rayDirX);
__m128 rcpDirY = rcp_ps(rayDirY);
__m128 rcpDirZ = rcp_ps(rayDirZ);
// steps*: ONE in lanes whose direction component is > ZERO, all-zero bits otherwise.
__m128 stepsX = _mm_and_ps(_mm_cmpgt_ps(rayDirX, ZERO), ONE);
__m128 stepsY = _mm_and_ps(_mm_cmpgt_ps(rayDirY, ZERO), ONE);
__m128 stepsZ = _mm_and_ps(_mm_cmpgt_ps(rayDirZ, ZERO), ONE);
// signsAdv*: (ONE for positive, Neg_ONE for negative, zero for a zero component)
// times VEC_EPSILON. Added to boundary points below to nudge them along the ray.
__m128 signsAdvX = _mm_mul_ps(_mm_or_ps(stepsX, _mm_and_ps(_mm_cmplt_ps(rayDirX, ZERO), Neg_ONE)), VEC_EPSILON);
__m128 signsAdvY = _mm_mul_ps(_mm_or_ps(stepsY, _mm_and_ps(_mm_cmplt_ps(rayDirY, ZERO), Neg_ONE)), VEC_EPSILON);
__m128 signsAdvZ = _mm_mul_ps(_mm_or_ps(stepsZ, _mm_and_ps(_mm_cmplt_ps(rayDirZ, ZERO), Neg_ONE)), VEC_EPSILON);
// Ray origin scaled by VEC_GRIDSZ (presumably into grid-cell units — confirm).
__m128 texBaseX = _mm_mul_ps(rayOrgX, VEC_GRIDSZ);
__m128 texBaseY = _mm_mul_ps(rayOrgY, VEC_GRIDSZ);
__m128 texBaseZ = _mm_mul_ps(rayOrgZ, VEC_GRIDSZ);
// Fetch the voxel at the ray origin; voxInfo4 and normCone4i are presumably
// filled by reference (they are uninitialized before the call and read after it).
__m128i voxInfo4, normCone4i;
fetch3DiVector(volGridStartSize, rayOrgX, rayOrgY, rayOrgZ, voxInfo4, normCone4i);
// Unpack normCone4i: bits 16-23, 8-15 and 0-7 as sign-extended bytes (x, y, z),
// bits 24-31 as an unsigned byte (w).
__m128 normCone4x = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(normCone4i, 8), 24));
__m128 normCone4y = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(normCone4i, 16), 24));
__m128 normCone4z = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(normCone4i, 24), 24));
__m128 normCone4w = _mm_cvtepi32_ps(_mm_srli_epi32(normCone4i, 24));
// Scale x/y/z by 1/255 and w by 1/510.
normCone4x = _mm_mul_ps(normCone4x, _mm_set1_ps(1.f / 255.0));
normCone4y = _mm_mul_ps(normCone4y, _mm_set1_ps(1.f / 255.0));
normCone4z = _mm_mul_ps(normCone4z, _mm_set1_ps(1.f / 255.0));
normCone4w = _mm_mul_ps(normCone4w, _mm_set1_ps(1.f / 510.0));
// dot4 = rayDir . (x, y, z) of the unpacked vector.
__m128 dot4 = _mm_mul_ps(rayDirX, normCone4x);
dot4 = _mm_add_ps(dot4, _mm_mul_ps(rayDirY, normCone4y));
dot4 = _mm_add_ps(dot4, _mm_mul_ps(rayDirZ, normCone4z));
// Lanes that are not finished and whose dot product is below w keep their voxel
// data; every other lane gets geoInfo4 == 0.
// NOTE(review): presumably a normal-cone visibility rejection — confirm.
__m128_ms normalConeMsk = _mm_andnot_ps(finishMsk, _mm_cmplt_ps(dot4, normCone4w));
__m128i geoInfo4 = _mm_and_si128(voxInfo4, normalConeMsk);
// Initial per-axis parametric values:
// ((floorPositive(texBase) + steps) - texBase) * rcpDir.
__m128 tMaxX = _mm_sub_ps(_mm_add_ps(floorPositive(texBaseX), stepsX), texBaseX);
__m128 tMaxY = _mm_sub_ps(_mm_add_ps(floorPositive(texBaseY), stepsY), texBaseY);
__m128 tMaxZ = _mm_sub_ps(_mm_add_ps(floorPositive(texBaseZ), stepsZ), texBaseZ);
tMaxX = _mm_mul_ps(tMaxX, rcpDirX);
tMaxY = _mm_mul_ps(tMaxY, rcpDirY);
tMaxZ = _mm_mul_ps(tMaxZ, rcpDirZ);
// Rescale steps* from {0, 1} to {0, 255}; later they multiply gridSize_Cone*
// values that have already been scaled by 1/255.
stepsX = _mm_mul_ps(stepsX, _mm_set1_ps(255.f));
stepsY = _mm_mul_ps(stepsY, _mm_set1_ps(255.f));
stepsZ = _mm_mul_ps(stepsZ, _mm_set1_ps(255.f));
// Hit accumulators passed to VecIntersects. lastHitT starts at the smallest of
// the three per-axis values computed above.
__m128 lastHitU = ZERO;
__m128 lastHitV = ZERO;
__m128 lastHitID = ZERO;
__m128 lastHitT = _mm_min_ps(tMaxX, _mm_min_ps(tMaxY, tMaxZ));
int finished4 = _mm_movemask_ps(finishMsk);
// Outer loop: run until all four lanes are flagged finished.
while (finished4 != 0xf)
{
flowCounter.z++;
// Keep only the top byte of geoInfo4, and only for unfinished lanes.
// NOTE(review): the top byte looks like a per-voxel primitive count — confirm.
__m128i geoInfoW = _mm_andnot_si128(finishMsk, _mm_and_si128(geoInfo4, _mm_set1_epi32(0xff000000)));
// NOTE(review): signed 32-bit compare — a top byte of 0x80 or more makes the lane
// negative, so it is treated as having nothing left. Confirm the value stays below 128.
__m128_ms primLeftMsk = _mm_cmpgt_epi32(geoInfoW, _mm_set1_epi32(0));
int anyPrims = _mm_movemask_epi8(primLeftMsk);
// Primitive loop: one primitive per lane per pass while any lane has some left.
while (anyPrims)
{
flowCounter.x++;
// Decrement the top byte by one (unsigned saturating byte subtract).
geoInfoW = _mm_subs_epu8(geoInfoW, _mm_set1_epi32(0x01000000));
// id4 = geoInfo4 plus the decremented counter, added in 16-bit elements. Lanes
// that had nothing left before the decrement are zeroed.
__m128_ms id4 = _mm_and_si128(primLeftMsk, _mm_add_epi16(geoInfo4, _mm_srli_epi32(geoInfoW, 24)));
// Scalar gather: 16-bit elements 0/2/4/6 are read from id4, one per 32-bit lane,
// to index mesh.pointers. Zeroed lanes read mesh.pointers[0].
// m128i_u16 is MSVC-specific member access.
int pointer4[4];
pointer4[0] = mesh.pointers[id4.m128i_u16[0]];
pointer4[1] = mesh.pointers[id4.m128i_u16[2]];
pointer4[2] = mesh.pointers[id4.m128i_u16[4]];
pointer4[3] = mesh.pointers[id4.m128i_u16[6]];
// The low 16 bits of each id as a float; this is the tag passed to VecIntersects.
__m128 id4f = _mm_cvtepi32_ps(_mm_and_si128(id4, _mm_set1_epi32(0x0000FFFF)));
// Load four floats of face normal per lane and transpose into X/Y/Z(/W) registers.
__m128 faceNormalX = _mm_loadu_ps(mesh.faceNormals[pointer4[0]]);
__m128 faceNormalY = _mm_loadu_ps(mesh.faceNormals[pointer4[1]]);
__m128 faceNormalZ = _mm_loadu_ps(mesh.faceNormals[pointer4[2]]);
__m128 faceNormalW = _mm_loadu_ps(mesh.faceNormals[pointer4[3]]);
_MM_TRANSPOSE4_PS(faceNormalX, faceNormalY, faceNormalZ, faceNormalW);
// Only lanes that had a primitive and whose faceNormal . rayDir is negative go on
// to the intersection test (presumably a back-face cull — confirm).
__m128 dotProd4 = _mm_mul_ps(faceNormalX, rayDirX);
dotProd4 = _mm_add_ps(dotProd4, _mm_mul_ps(faceNormalY, rayDirY));
dotProd4 = _mm_add_ps(dotProd4, _mm_mul_ps(faceNormalZ, rayDirZ));
__m128 dpTestMsk = _mm_and_ps(primLeftMsk, _mm_cmplt_ps(dotProd4, ZERO));
// Refresh the "primitives left" mask from the decremented counter. This must stay
// after dpTestMsk, which needs the mask from before the decrement.
primLeftMsk = _mm_cmpgt_epi32(geoInfoW, _mm_set1_epi32(0));
anyPrims = _mm_movemask_epi8(primLeftMsk);
int dpTest4 = _mm_movemask_ps(dpTestMsk);
if (dpTest4)
{
// Gather posNorm[0..2] of the four primitives. After each transpose the first
// three registers hold components 0-2 of that row and the fourth register holds
// component 3.
// NOTE(review): component 3 of the three rows is used as a point (vBX, vBY, vBZ)
// and rows 0/1/2 as edge-like vectors e1/e2/e3 — confirm against DoubleTriangle.
const DoubleTriangle* quad4[4];
quad4[0] = &mesh.quads[pointer4[0]];
quad4[1] = &mesh.quads[pointer4[1]];
quad4[2] = &mesh.quads[pointer4[2]];
quad4[3] = &mesh.quads[pointer4[3]];
__m128 e1X = _mm_loadu_ps(quad4[0]->posNorm[0]);
__m128 e1Y = _mm_loadu_ps(quad4[1]->posNorm[0]);
__m128 e1Z = _mm_loadu_ps(quad4[2]->posNorm[0]);
__m128 vBX = _mm_loadu_ps(quad4[3]->posNorm[0]);
_MM_TRANSPOSE4_PS(e1X, e1Y, e1Z, vBX);
__m128 e2X = _mm_loadu_ps(quad4[0]->posNorm[1]);
__m128 e2Y = _mm_loadu_ps(quad4[1]->posNorm[1]);
__m128 e2Z = _mm_loadu_ps(quad4[2]->posNorm[1]);
__m128 vBY = _mm_loadu_ps(quad4[3]->posNorm[1]);
_MM_TRANSPOSE4_PS(e2X, e2Y, e2Z, vBY);
__m128 e3X = _mm_loadu_ps(quad4[0]->posNorm[2]);
__m128 e3Y = _mm_loadu_ps(quad4[1]->posNorm[2]);
__m128 e3Z = _mm_loadu_ps(quad4[2]->posNorm[2]);
__m128 vBZ = _mm_loadu_ps(quad4[3]->posNorm[2]);
_MM_TRANSPOSE4_PS(e3X, e3Y, e3Z, vBZ);
// va = rayOrg - vB
__m128 vaX = _mm_sub_ps(rayOrgX, vBX);
__m128 vaY = _mm_sub_ps(rayOrgY, vBY);
__m128 vaZ = _mm_sub_ps(rayOrgZ, vBZ);
// q = va x e2 (cross product); the same q is passed to both tests below.
__m128 qX = _mm_sub_ps(_mm_mul_ps(vaY, e2Z), _mm_mul_ps(vaZ, e2Y));
__m128 qY = _mm_sub_ps(_mm_mul_ps(vaZ, e2X), _mm_mul_ps(vaX, e2Z));
__m128 qZ = _mm_sub_ps(_mm_mul_ps(vaX, e2Y), _mm_mul_ps(vaY, e2X));
// First test uses e1 with e2 and tags hits with id4f.
// VecIntersects presumably updates lastHitU/V/ID/T by reference for lanes enabled
// in dpTestMsk — confirm.
VecIntersects( vaX, vaY, vaZ,
e2X, e2Y, e2Z,
e1X, e1Y, e1Z,
qX, qY, qZ,
rayDirX, rayDirY, rayDirZ,
lastHitU, lastHitV, lastHitID, lastHitT,
id4f, dpTestMsk);
// Second test uses e3 with e2; the tag is id4f OR'ed with NEG_SIGN
// (presumably the sign bit, marking the second triangle — confirm).
VecIntersects( vaX, vaY, vaZ,
e2X, e2Y, e2Z,
e3X, e3Y, e3Z,
qX, qY, qZ,
rayDirX, rayDirY, rayDirZ,
lastHitU, lastHitV, lastHitID, lastHitT,
_mm_or_ps(id4f, NEG_SIGN), dpTestMsk);
// Any lane whose lastHitID differs from ZERO is flagged finished. The primitive
// loop keeps running on primLeftMsk, so remaining primitives are still tested.
// NOTE(review): if ZERO is 0.0f, an id of 0 (+0.0 or -0.0) can never register as
// a hit here — confirm that ids start at 1.
__m128 isHitMsk = _mm_cmpneq_ps(lastHitID, ZERO);
finishMsk = _mm_or_ps(finishMsk, isHitMsk);
finished4 = _mm_movemask_ps(finishMsk);
}
}
// Voxel-step loop: advance every unfinished lane until it either picks up new
// voxel data (normalConeMsk below) or is flagged finished by leaving the grid.
int foundVox4 = finished4;
__m128 foundVoxMsk = finishMsk;
while (foundVox4 != 0xf)
{
flowCounter.y++;
// Point at parameter lastHitT along the ray, nudged by signsAdv*.
__m128 intersectPointX = _mm_add_ps(_mm_mul_ps(lastHitT, rayDirX), texBaseX);
__m128 intersectPointY = _mm_add_ps(_mm_mul_ps(lastHitT, rayDirY), texBaseY);
__m128 intersectPointZ = _mm_add_ps(_mm_mul_ps(lastHitT, rayDirZ), texBaseZ);
__m128 curCellX = _mm_add_ps(intersectPointX, signsAdvX);
__m128 curCellY = _mm_add_ps(intersectPointY, signsAdvY);
__m128 curCellZ = _mm_add_ps(intersectPointZ, signsAdvZ);
// Scale by 1/GRID_SIZE to form the coordinates passed to the fetch.
__m128 texCoordX = _mm_mul_ps(curCellX, _mm_set1_ps(1.f / GRID_SIZE));
__m128 texCoordY = _mm_mul_ps(curCellY, _mm_set1_ps(1.f / GRID_SIZE));
__m128 texCoordZ = _mm_mul_ps(curCellZ, _mm_set1_ps(1.f / GRID_SIZE));
// Note: this voxInfo4 shadows the one declared before the outer loop.
__m128i voxInfo4, gridSize_Conei;
fetch3DiVector(volGridStartSize, texCoordX, texCoordY, texCoordZ, voxInfo4, gridSize_Conei);
// Unpack voxInfo4 bits 16-23, 8-15 and 0-7 as unsigned bytes into gridStartX/Y/Z.
__m128 gridStartX = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(voxInfo4, 8), 24));
__m128 gridStartY = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(voxInfo4, 16), 24));
__m128 gridStartZ = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(voxInfo4, 24), 24));
// The top byte of voxInfo4; contMsk selects not-yet-found lanes where it is nonzero.
// NOTE(review): the integer bit pattern is compared as a float. Values 1-255 are
// denormals, so the result depends on denormals-are-zero mode being off — confirm.
__m128i bCont4 = _mm_srli_epi32(voxInfo4, 24);
__m128 contMsk = _mm_andnot_ps(foundVoxMsk, _mm_cmpneq_ps(_mm_castsi128_ps(bCont4), ZERO));
int bCont = _mm_movemask_ps(contMsk);
// Unpack gridSize_Conei bits 16-23, 8-15 and 0-7 as sign-extended bytes, scaled by 1/255.
__m128 gridSize_ConeX = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(gridSize_Conei, 8), 24));
__m128 gridSize_ConeY = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(gridSize_Conei, 16), 24));
__m128 gridSize_ConeZ = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(gridSize_Conei, 24), 24));
gridSize_ConeX = _mm_mul_ps(gridSize_ConeX, _mm_set1_ps(1.f / 255.0));
gridSize_ConeY = _mm_mul_ps(gridSize_ConeY, _mm_set1_ps(1.f / 255.0));
gridSize_ConeZ = _mm_mul_ps(gridSize_ConeZ, _mm_set1_ps(1.f / 255.0));
// Snapshot of foundVoxMsk taken before this pass updates it; used in the
// lastHitT select below.
__m128 backFoundVoxMsk = foundVoxMsk;
if (bCont)
{
// The dot product with the ray direction is taken first, because the same
// registers are overwritten for contMsk lanes right after.
__m128 dotProd4 = _mm_mul_ps(gridSize_ConeX, rayDirX);
dotProd4 = _mm_add_ps(dotProd4, _mm_mul_ps(gridSize_ConeY, rayDirY));
dotProd4 = _mm_add_ps(dotProd4, _mm_mul_ps(gridSize_ConeZ, rayDirZ));
// NOTE(review): select(mask, a, b) presumably yields a where mask is set. If so,
// contMsk lanes get 1/255, which times steps* (0 or 255) gives a step of 0 or 1.
gridSize_ConeX = select(contMsk, _mm_set1_ps(1.f / 255.0), gridSize_ConeX);
gridSize_ConeY = select(contMsk, _mm_set1_ps(1.f / 255.0), gridSize_ConeY);
gridSize_ConeZ = select(contMsk, _mm_set1_ps(1.f / 255.0), gridSize_ConeZ);
// Top byte scaled by 1/510, then the same "dot < w" test as for the first voxel,
// restricted to contMsk lanes.
__m128 normalConeW = _mm_cvtepi32_ps(_mm_srli_epi32(gridSize_Conei, 24));
normalConeW = _mm_mul_ps(normalConeW, _mm_set1_ps(1.f / 510.0));
__m128_ms normalConeMsk = _mm_and_ps(contMsk, _mm_cmplt_ps(dotProd4, normalConeW));
// Lanes passing the test take the fetched voxInfo4 as their geoInfo4 and are
// marked found.
geoInfo4 = select(normalConeMsk, voxInfo4, geoInfo4);
foundVoxMsk = _mm_or_ps(normalConeMsk, foundVoxMsk);
// For contMsk lanes the grid start becomes floorPositive of the current cell.
gridStartX = select(contMsk, floorPositive(curCellX), gridStartX);
gridStartY = select(contMsk, floorPositive(curCellY), gridStartY);
gridStartZ = select(contMsk, floorPositive(curCellZ), gridStartZ);
}
// Target corner per axis: gridStart + gridSize_Cone * steps (steps is 0 or 255).
__m128 targetConerX = _mm_add_ps(_mm_mul_ps(gridSize_ConeX, stepsX), gridStartX);
__m128 targetConerY = _mm_add_ps(_mm_mul_ps(gridSize_ConeY, stepsY), gridStartY);
__m128 targetConerZ = _mm_add_ps(_mm_mul_ps(gridSize_ConeZ, stepsZ), gridStartZ);
// Note: these tMaxX/Y/Z shadow the outer ones declared before the outer loop.
__m128 tMaxX = _mm_mul_ps(_mm_sub_ps(targetConerX, texBaseX), rcpDirX);
__m128 tMaxY = _mm_mul_ps(_mm_sub_ps(targetConerY, texBaseY), rcpDirY);
__m128 tMaxZ = _mm_mul_ps(_mm_sub_ps(targetConerZ, texBaseZ), rcpDirZ);
// Presumably keeps the old lastHitT for lanes already found before this pass and
// takes the new minimum for the rest — depends on select's argument order.
lastHitT = select(backFoundVoxMsk, lastHitT, _mm_min_ps(_mm_min_ps(tMaxX, tMaxY), tMaxZ));
// A lane is flagged finished when any cell coordinate is below ONE or above
// GRID_SIZE-1.
__m128 outSideMsk = _mm_cmplt_ps(curCellX, ONE);
outSideMsk = _mm_or_ps(outSideMsk, _mm_cmplt_ps(curCellY, ONE));
outSideMsk = _mm_or_ps(outSideMsk, _mm_cmplt_ps(curCellZ, ONE));
outSideMsk = _mm_or_ps(outSideMsk, _mm_cmpgt_ps(curCellX, _mm_set1_ps(GRID_SIZE-1)));
outSideMsk = _mm_or_ps(outSideMsk, _mm_cmpgt_ps(curCellY, _mm_set1_ps(GRID_SIZE-1)));
outSideMsk = _mm_or_ps(outSideMsk, _mm_cmpgt_ps(curCellZ, _mm_set1_ps(GRID_SIZE-1)));
finishMsk = _mm_or_ps(finishMsk, outSideMsk);
foundVoxMsk = _mm_or_ps(foundVoxMsk, finishMsk);
foundVox4 = _mm_movemask_ps(foundVoxMsk);
}
// Refresh the outer-loop condition after the stepping loop may have finished lanes.
finished4 = _mm_movemask_ps(finishMsk);
}
}