Cogs.Core / NormalVectors.cpp
#include "GeometryProcessing.h"

#include "Context.h"
#include "Services/TaskManager.h"
#include "Services/Features.h"
#include "Platform/Instrumentation.h"

#include "Foundation/Memory/MemoryBuffer.h"
#include "Foundation/Platform/Timer.h"

#include <algorithm>
#include <atomic>
#include <cstring>
#include <vector>

#include <glm/glm.hpp>
#include <glm/gtc/type_ptr.hpp>

#if !defined(EMSCRIPTEN) && !defined(__APPLE__)
#include <emmintrin.h>  // SSE2 intrinsics; xmmintrin.h alone only covers SSE1.
#endif

#if defined(_WIN32)
#include <intrin.h>     // _InterlockedExchange
#endif

using namespace Cogs::Core;

namespace {

    struct Corner
    {
        uint32_t Vp, Vn;    // Previous and next vertex around the triangle's winding.
        uint32_t triangle;  // Triangle index in the upper bits, corner number (0..2) in the low two bits.
        uint32_t next;      // Next corner attached to the same vertex; ~0u terminates the list.
    };

    template<typename T>
    inline T* alignUpwards(void* ptr, size_t alignment = 64)
    {
        auto a = alignment - 1;
        return reinterpret_cast<T*>(((size_t)ptr + a) & (~a));
    }
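
    // alignUpwards rounds a pointer up to the next multiple of a power-of-two
    // alignment by adding (alignment - 1) and masking off the low bits. Worked
    // example with the default 64-byte alignment: ptr = 0x1007 gives
    // (0x1007 + 0x3F) & ~0x3F = 0x1040, while an already aligned 0x1040 maps to
    // itself. It is used below to cache-line align the scratch buffers, which
    // are over-allocated by 64 bytes to leave room for the shift.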

#if !defined(EMSCRIPTEN) && !defined(__APPLE__)
    inline __m128 loadVec3(const float* p)
    {
#if 1
        // Load 4 floats directly, at the risk of reading one float beyond the range. SSE2.
        return _mm_loadu_ps(p);
#else
        // Load the first two elements as a double, then insert the third element with a shuffle. SSE2.
        __m128 t0 = _mm_castpd_ps(_mm_load_sd((const double*)p));
        __m128 t1 = _mm_load_ss(p + 2);
        return _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(1, 0, 1, 0));
#endif
    }

    inline void storeVec3(float* p, __m128 v)
    {
        _mm_store_sd((double*)p, _mm_castps_pd(v));
        _mm_store_ss(p + 2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
    }
#endif
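
    // The _mm_loadu_ps path above may read up to four bytes past the last
    // vertex, which is assumed tolerable for the buffers passed in here. A
    // strictly in-bounds alternative would be a scalar gather (a sketch, not
    // used in this file):
    //
    //   inline __m128 loadVec3Safe(const float* p)
    //   {
    //       return _mm_set_ps(0.0f, p[2], p[1], p[0]);  // {p[0], p[1], p[2], 0}
    //   }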

    struct ClearStridedVec3Task
    {
        float* normals;
        uint32_t normalStride;
        uint32_t ia, ib;
        std::atomic<uint64_t>* elapsed_us;

        ClearStridedVec3Task(float* normals,
                             uint32_t normalStride,
                             uint32_t ia, uint32_t ib,
                             std::atomic<uint64_t>* elapsed_us) :
            normals(normals),
            normalStride(normalStride),
            ia(ia), ib(ib),
            elapsed_us(elapsed_us)
        {}

        void operator()()
        {
            CpuInstrumentationScope(SCOPE_GEOMETRY, "StrClr");
            auto timer = Cogs::Timer::startNew();

            for (uint32_t i = ia; i < ib; i++) {
                normals[normalStride * i + 0] = 0.0f;
                normals[normalStride * i + 1] = 0.0f;
                normals[normalStride * i + 2] = 0.0f;
            }

            if (elapsed_us) {
                elapsed_us->fetch_add(timer.elapsedMicroseconds());
            }
        }
    };
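
    // ClearStridedVec3Task is not referenced below; it is a building block for
    // callers that need to zero a strided normal buffer before accumulating
    // into it. A hypothetical dispatch, mirroring the task groups used later
    // in this file (the surrounding names are assumed):
    //
    //   TaskId group = tm->createGroup();
    //   for (uint32_t ia = 0; ia < numVertices; ia += taskSize) {
    //       tm->enqueueChild(group, ClearStridedVec3Task(normals, normalStride,
    //                                                    ia, std::min(numVertices, ia + taskSize),
    //                                                    elapsed_us));
    //   }
    //   tm->wait(group);
    //   tm->destroy(group);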

    struct TriangleNormalsTask
    {
        const uint32_t* indices;
        float* normals;
        const float* vertices;
        uint32_t vertexStride;
        uint32_t ia, ib;
        std::atomic<uint64_t>* elapsed_us;

        TriangleNormalsTask(const uint32_t* indices,
                            float* normals,
                            const float* vertices,
                            uint32_t vertexStride,
                            uint32_t ia, uint32_t ib,
                            std::atomic<uint64_t>* elapsed_us = nullptr) :
            indices(indices),
            normals(normals),
            vertices(vertices),
            vertexStride(vertexStride),
            ia(ia), ib(ib),
            elapsed_us(elapsed_us)
        {}

        void operator()()
        {
            CpuInstrumentationScope(SCOPE_GEOMETRY, "TriNrm");
            auto timer = Cogs::Timer::startNew();
            for (uint32_t i = ia; i < ib; i++) {
                const auto i0 = indices[3 * i + 0];
                const auto i1 = indices[3 * i + 1];
                const auto i2 = indices[3 * i + 2];
                const auto & a = glm::make_vec3(vertices + vertexStride * i0);
                const auto & b = glm::make_vec3(vertices + vertexStride * i1);
                const auto & c = glm::make_vec3(vertices + vertexStride * i2);
                const auto n = glm::cross(c - a, b - a);

                // Triangle normals are stored with a stride of 4 floats; this
                // scalar path leaves the fourth float untouched.
                *reinterpret_cast<glm::vec3*>(normals + 4 * i) = n;
            }
            if (elapsed_us) {
                elapsed_us->fetch_add(timer.elapsedMicroseconds());
            }
        }
    };
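
    // The cross(c - a, b - a) order means the normal faces the side from which
    // the triangle winds clockwise. Worked example: a = (0,0,0), b = (1,0,0),
    // c = (0,1,0) gives cross((0,1,0), (1,0,0)) = (0,0,-1), i.e. the normal of
    // a counter-clockwise xy-triangle points down the negative z axis. The
    // 'flip' parameter of normalsFromIndexedTriangles below swaps b and c to
    // reverse this.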

#if !defined(EMSCRIPTEN) && !defined(__APPLE__)
    struct TriangleNormalsTaskSSE2a
    {
        const uint32_t* indices;
        float* triangleNormals;
        const float* vertices;
        uint32_t vertexStride;
        uint32_t ia, ib;
        std::atomic<uint64_t>* elapsed_us;

        TriangleNormalsTaskSSE2a(const uint32_t* indices,
                                 float* triangleNormals,
                                 const float* vertices,
                                 uint32_t vertexStride,
                                 uint32_t ia, uint32_t ib,
                                 std::atomic<uint64_t>* elapsed_us) :
            indices(indices),
            triangleNormals(triangleNormals),
            vertices(vertices),
            vertexStride(vertexStride),
            ia(ia), ib(ib),
            elapsed_us(elapsed_us)
        {}

        void operator()()
        {
            CpuInstrumentationScope(SCOPE_GEOMETRY, "TriNrmSSE2");
            auto timer = Cogs::Timer::startNew();
            uint32_t i = ia;
#if 1
            // Main loop: two triangles per iteration.
            auto indexPtr = indices + 3 * ia;
            for (; i + 1 < ib; i += 2) {
                const auto ix0_a = *indexPtr++;
                const auto ix1_a = *indexPtr++;
                const auto ix2_a = *indexPtr++;

                const auto ix0_b = *indexPtr++;
                const auto ix1_b = *indexPtr++;
                const auto ix2_b = *indexPtr++;

                __m128 a_a = loadVec3(vertices + vertexStride * ix0_a);
                __m128 a_b = loadVec3(vertices + vertexStride * ix0_b);
                __m128 b_a = loadVec3(vertices + vertexStride * ix1_a);
                __m128 b_b = loadVec3(vertices + vertexStride * ix1_b);
                __m128 c_a = loadVec3(vertices + vertexStride * ix2_a);
                __m128 c_b = loadVec3(vertices + vertexStride * ix2_b);

                __m128 ca_xyz_a = _mm_sub_ps(c_a, a_a);
                __m128 ca_xyz_b = _mm_sub_ps(c_b, a_b);
                __m128 ca_yzx_a = _mm_shuffle_ps(ca_xyz_a, ca_xyz_a, _MM_SHUFFLE(3, 0, 2, 1));
                __m128 ca_yzx_b = _mm_shuffle_ps(ca_xyz_b, ca_xyz_b, _MM_SHUFFLE(3, 0, 2, 1));

                __m128 ba_xyz_a = _mm_sub_ps(b_a, a_a);
                __m128 ba_xyz_b = _mm_sub_ps(b_b, a_b);
                __m128 ba_yzx_a = _mm_shuffle_ps(ba_xyz_a, ba_xyz_a, _MM_SHUFFLE(3, 0, 2, 1));
                __m128 ba_yzx_b = _mm_shuffle_ps(ba_xyz_b, ba_xyz_b, _MM_SHUFFLE(3, 0, 2, 1));

                __m128 f_zxy_a = _mm_mul_ps(ca_xyz_a, ba_yzx_a);
                __m128 f_zxy_b = _mm_mul_ps(ca_xyz_b, ba_yzx_b);
                __m128 g_zxy_a = _mm_mul_ps(ba_xyz_a, ca_yzx_a);
                __m128 g_zxy_b = _mm_mul_ps(ba_xyz_b, ca_yzx_b);

                __m128 n_zxy_a = _mm_sub_ps(f_zxy_a, g_zxy_a);
                __m128 n_zxy_b = _mm_sub_ps(f_zxy_b, g_zxy_b);
                __m128 n_xyz_a = _mm_shuffle_ps(n_zxy_a, n_zxy_a, _MM_SHUFFLE(3, 0, 2, 1));
                __m128 n_xyz_b = _mm_shuffle_ps(n_zxy_b, n_zxy_b, _MM_SHUFFLE(3, 0, 2, 1));

                _mm_store_ps(triangleNormals + 4 * (i + 0), n_xyz_a);
                _mm_store_ps(triangleNormals + 4 * (i + 1), n_xyz_b);
            }
#endif
            // Tail: handle the last triangle of an odd-sized range.
            for (; i < ib; i++) {
                const auto ix0 = indices[3 * i + 0];
                const auto ix1 = indices[3 * i + 1];
                const auto ix2 = indices[3 * i + 2];

                __m128 a = loadVec3(vertices + vertexStride * ix0);
                __m128 b = loadVec3(vertices + vertexStride * ix1);
                __m128 c = loadVec3(vertices + vertexStride * ix2);
                __m128 ca_xyz = _mm_sub_ps(c, a);
                __m128 ca_yzx = _mm_shuffle_ps(ca_xyz, ca_xyz, _MM_SHUFFLE(3, 0, 2, 1));

                __m128 ba_xyz = _mm_sub_ps(b, a);
                __m128 ba_yzx = _mm_shuffle_ps(ba_xyz, ba_xyz, _MM_SHUFFLE(3, 0, 2, 1));

                __m128 f_zxy = _mm_mul_ps(ca_xyz, ba_yzx);
                __m128 g_zxy = _mm_mul_ps(ba_xyz, ca_yzx);

                __m128 n_zxy = _mm_sub_ps(f_zxy, g_zxy);
                __m128 n_xyz = _mm_shuffle_ps(n_zxy, n_zxy, _MM_SHUFFLE(3, 0, 2, 1));

                _mm_store_ps(triangleNormals + 4 * i, n_xyz);
            }
            if (elapsed_us) {
                elapsed_us->fetch_add(timer.elapsedMicroseconds());
            }
        }
    };
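
    // The cross product above uses a "shuffle late" rearrangement. Writing
    // u = c - a and v = b - a, the standard form
    //
    //     u x v = u.yzx * v.zxy - u.zxy * v.yzx          (4 shuffles)
    //
    // is rewritten so both products share the same lane order:
    //
    //     (u x v).zxy = u.xyz * v.yzx - v.xyz * u.yzx    (2 shuffles)
    //
    // leaving a single final shuffle to rotate the result from zxy back to
    // xyz order: three shuffles per cross product instead of four.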

    struct TriangleNormalsTaskSSE2b
    {
        const uint32_t* indices;
        float* triangleNormals;
        const float* vertices;
        uint32_t vertexStride;
        uint32_t ia, ib;
        std::atomic<uint64_t>* elapsed_us;

        TriangleNormalsTaskSSE2b(const uint32_t* indices,
                                 float* triangleNormals,
                                 const float* vertices,
                                 uint32_t vertexStride,
                                 uint32_t ia, uint32_t ib,
                                 std::atomic<uint64_t>* elapsed_us = nullptr) :
            indices(indices),
            triangleNormals(triangleNormals),
            vertices(vertices),
            vertexStride(vertexStride),
            ia(ia), ib(ib),
            elapsed_us(elapsed_us)
        {}

        void operator()()
        {
            CpuInstrumentationScope(SCOPE_GEOMETRY, "NrmAddSSE2");
            auto timer = Cogs::Timer::startNew();

            for (uint32_t i = ia; i < ib; i += 4) {
                uint32_t ii[4];
                uint32_t ix[4][3];

                // Gather the indices of four triangles; a partial tail is
                // clamped to the last triangle, which then gets harmlessly
                // recomputed and rewritten.
                for (uint32_t j = 0; j < 4; j++) {
                    ii[j] = std::min(ib - 1, i + j);
                    for (uint32_t k = 0; k < 3; k++) {
                        ix[j][k] = indices[3 * ii[j] + k];
                    }
                }

                // Load a, b and c of all four triangles and transpose from AoS
                // to SoA, so each register holds one component of four vertices.
                __m128 a_x = _mm_loadu_ps(vertices + vertexStride * ix[0][0]);
                __m128 a_y = _mm_loadu_ps(vertices + vertexStride * ix[1][0]);
                __m128 a_z = _mm_loadu_ps(vertices + vertexStride * ix[2][0]);
                __m128 a_w = _mm_loadu_ps(vertices + vertexStride * ix[3][0]);
                _MM_TRANSPOSE4_PS(a_x, a_y, a_z, a_w);

                __m128 b_x = _mm_loadu_ps(vertices + vertexStride * ix[0][1]);
                __m128 b_y = _mm_loadu_ps(vertices + vertexStride * ix[1][1]);
                __m128 b_z = _mm_loadu_ps(vertices + vertexStride * ix[2][1]);
                __m128 b_w = _mm_loadu_ps(vertices + vertexStride * ix[3][1]);
                _MM_TRANSPOSE4_PS(b_x, b_y, b_z, b_w);

                __m128 c_x = _mm_loadu_ps(vertices + vertexStride * ix[0][2]);
                __m128 c_y = _mm_loadu_ps(vertices + vertexStride * ix[1][2]);
                __m128 c_z = _mm_loadu_ps(vertices + vertexStride * ix[2][2]);
                __m128 c_w = _mm_loadu_ps(vertices + vertexStride * ix[3][2]);
                _MM_TRANSPOSE4_PS(c_x, c_y, c_z, c_w);

                __m128 ba_x = _mm_sub_ps(b_x, a_x);
                __m128 ba_y = _mm_sub_ps(b_y, a_y);
                __m128 ba_z = _mm_sub_ps(b_z, a_z);

                __m128 ca_x = _mm_sub_ps(c_x, a_x);
                __m128 ca_y = _mm_sub_ps(c_y, a_y);
                __m128 ca_z = _mm_sub_ps(c_z, a_z);

                // cross(c - a, b - a) with one component per register: no shuffles needed.
                __m128 n_x = _mm_sub_ps(_mm_mul_ps(ca_y, ba_z), _mm_mul_ps(ba_y, ca_z));
                __m128 n_y = _mm_sub_ps(_mm_mul_ps(ca_z, ba_x), _mm_mul_ps(ba_z, ca_x));
                __m128 n_z = _mm_sub_ps(_mm_mul_ps(ca_x, ba_y), _mm_mul_ps(ba_x, ca_y));
                __m128 n_w = _mm_setzero_ps();
                _MM_TRANSPOSE4_PS(n_x, n_y, n_z, n_w);

                _mm_store_ps(triangleNormals + 4 * ii[0], n_x);
                _mm_store_ps(triangleNormals + 4 * ii[1], n_y);
                _mm_store_ps(triangleNormals + 4 * ii[2], n_z);
                _mm_store_ps(triangleNormals + 4 * ii[3], n_w);
            }
            if (elapsed_us) {
                elapsed_us->fetch_add(timer.elapsedMicroseconds());
            }
        }
    };
#endif
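
    // Two SSE2 variants are kept: TriangleNormalsTaskSSE2a keeps each vertex in
    // one register (vertical cross product with shuffles per triangle), while
    // TriangleNormalsTaskSSE2b transposes four triangles into SoA form so the
    // cross product becomes plain component arithmetic. _MM_TRANSPOSE4_PS turns
    // rows {a0 a1 a2 a3} .. {d0 d1 d2 d3} into columns {a0 b0 c0 d0} ..
    // {a3 b3 c3 d3}. Only the SoA variant (b) is dispatched below; variant (a)
    // is retained for comparison.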

    struct VertexNormalsTask
    {
        float* vertexNormals;
        uint32_t vertexNormalStride;
        const float* triangleNormals;
        const uint32_t* inverseIndexHead;
        const uint32_t* inverseIndexNext;
        uint32_t ia, ib;
        std::atomic<uint64_t>* elapsed_us;

        VertexNormalsTask(float* vertexNormals,
                          uint32_t vertexNormalStride,
                          const float* triangleNormals,
                          const uint32_t* inverseIndexHead,
                          const uint32_t* inverseIndexNext,
                          uint32_t ia, uint32_t ib,
                          std::atomic<uint64_t>* elapsed_us) :
            vertexNormals(vertexNormals),
            vertexNormalStride(vertexNormalStride),
            triangleNormals(triangleNormals),
            inverseIndexHead(inverseIndexHead),
            inverseIndexNext(inverseIndexNext),
            ia(ia), ib(ib),
            elapsed_us(elapsed_us)
        {}

        void operator()()
        {
            CpuInstrumentationScope(SCOPE_GEOMETRY, "VtxNrm");
            auto timer = Cogs::Timer::startNew();

            for (uint32_t i = ia; i < ib; i++) {
                glm::vec3 n(0.0f);  // glm does not zero-initialize by default.

                // Walk the list of index-buffer positions referencing vertex i;
                // ix/3 recovers the triangle each position belongs to.
                auto ix = inverseIndexHead[i];
                while (ix != ~0u) {
                    n += *reinterpret_cast<const glm::vec3*>(triangleNormals + 4 * (ix / 3));
                    ix = inverseIndexNext[ix];
                }
                n = glm::normalize(n);

                *reinterpret_cast<glm::vec3*>(vertexNormals + vertexNormalStride * i) = n;
            }
            if (elapsed_us) {
                elapsed_us->fetch_add(timer.elapsedMicroseconds());
            }
        }
    };
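
    // The inverse index is a set of singly linked lists stored in two flat
    // arrays: head[v] is the most recent index-buffer position referencing
    // vertex v, and next[i] chains to the previous one, terminated by ~0u.
    // Worked example for indices = {0, 1, 2,  0, 2, 3} (two triangles sharing
    // the edge 0-2), built in sequential order:
    //
    //   head = {3, 1, 4, 5}           // last position mentioning vertex 0..3
    //   next = {~0, ~0, ~0, 0, 2, ~0}
    //
    // so vertex 0 is found at positions 3 then 0, i.e. in triangles 1 and 0.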
367
368#if !defined(EMSCRIPTEN) && !defined(__APPLE__)
369 struct VertexNormalsTaskSSE2
370 {
371 float* vertexNormals;
372 uint32_t vertexNormalStride;
373 const float * triangleNormals;
374 const uint32_t* inverseIndexHead;
375 const uint32_t* inverseIndexNext;
376 uint32_t ia, ib;
377 std::atomic<uint64_t>* elapsed_us;
378
379 VertexNormalsTaskSSE2(float* vertexNormals,
380 uint32_t vertexNormalStride,
381 const float * triangleNormals,
382 const uint32_t* inverseIndexHead,
383 const uint32_t* inverseIndexNext,
384 uint32_t ia, uint32_t ib,
385 std::atomic<uint64_t>* elapsed_us):
386 vertexNormals(vertexNormals),
387 vertexNormalStride(vertexNormalStride),
388 triangleNormals(triangleNormals),
389 inverseIndexHead(inverseIndexHead),
390 inverseIndexNext(inverseIndexNext),
391 ia(ia), ib(ib),
392 elapsed_us(elapsed_us)
393 {}
394
395 void operator()()
396 {
397 CpuInstrumentationScope(SCOPE_GEOMETRY, "VtxNrmSSE2");
398 auto timer = Cogs::Timer::startNew();
399
400 for (uint32_t i = ia; i < ib; i += 4) {
401
402 uint32_t ii[4];
403 __m128 n[4];
404 for (uint32_t l = 0; l < 4; l++) {
405 auto j = std::min(ib - 1, i + l);
406 ii[l] = j;
407
408 __m128 m = _mm_setzero_ps();
409 auto ix = inverseIndexHead[j];
410 while (ix != ~0) {
411 m = _mm_add_ps(m, _mm_load_ps(triangleNormals + 4 * (ix / 3)));
412 ix = inverseIndexNext[ix];
413 }
414 n[l] = m;
415 }
416 __m128 x = n[0];
417 __m128 y = n[1];
418 __m128 z = n[2];
419 __m128 w = n[3];
420
421 _MM_TRANSPOSE4_PS(x, y, z, w);
422 __m128 xx = _mm_mul_ps(x, x);
423 __m128 yy = _mm_mul_ps(y, y);
424 __m128 zz = _mm_mul_ps(z, z);
425 __m128 s0 = _mm_add_ps(xx, yy);
426 __m128 s1 = _mm_add_ps(s0, zz);
427
428 __m128 r = _mm_rsqrt_ps(s1);
429
430 storeVec3(vertexNormals + vertexNormalStride*ii[0], _mm_mul_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0)), n[0]));
431 storeVec3(vertexNormals + vertexNormalStride*ii[1], _mm_mul_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1)), n[1]));
432 storeVec3(vertexNormals + vertexNormalStride*ii[2], _mm_mul_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 2, 2, 2)), n[2]));
433 storeVec3(vertexNormals + vertexNormalStride*ii[3], _mm_mul_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3)), n[3]));
434 }
435 if (elapsed_us) {
436 elapsed_us->fetch_add(timer.elapsedMicroseconds());
437 }
438 }
439 };
440#endif
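
    // _mm_rsqrt_ps is an approximation (roughly 12 bits of precision), so this
    // path can differ slightly from the glm::normalize used by the scalar
    // VertexNormalsTask. If more precision were needed, one Newton-Raphson
    // step on the estimate r of 1/sqrt(s1) would be (a sketch, not used here):
    //
    //   __m128 half  = _mm_set1_ps(0.5f);
    //   __m128 three = _mm_set1_ps(3.0f);
    //   r = _mm_mul_ps(_mm_mul_ps(half, r),
    //                  _mm_sub_ps(three, _mm_mul_ps(s1, _mm_mul_ps(r, r))));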
}

void Cogs::Core::GeometryProcessing::normalsFromIndexedTriangles(Context* /*context*/,
                                                                 std::vector<glm::vec3>& N,
                                                                 std::vector<uint32_t>& remap,
                                                                 std::vector<uint32_t>& newIndices,
                                                                 const float* P,
                                                                 size_t P_stride, // byte-stride
                                                                 const uint32_t Nv,
                                                                 const uint32_t* indices,
                                                                 const uint32_t Ni,
                                                                 const float featureAngle,
                                                                 const float protrusionAngle,
                                                                 const bool flip)
{
    auto Nt = Ni / 3;
    const auto P_element_stride = uint32_t(P_stride / sizeof(float));
    Cogs::Memory::MemoryBuffer triangleNormals(4 * sizeof(float) * Nt + 64);
    auto * triangleNormalsPtr = alignUpwards<float>(triangleNormals.data());

#if 0
    // Disabled: the parallel dispatch below produces unnormalized normals, but
    // the angle tests in this function require normalized ones.
    const uint32_t taskSize = 10000;

    // Dispatch calculation of per-triangle normals.
    auto & tm = context->taskManager;
    TaskId group = tm->createGroup();
    const bool useSSE = context->features->supported(CPUFeature::SSE2);
    for (uint32_t i = 0; i < Nt; i += taskSize) {
        if (useSSE) {
            tm->enqueueChild(group, TriangleNormalsTaskSSE2b(indices,
                                                             triangleNormalsPtr,
                                                             P,
                                                             P_element_stride,
                                                             i, glm::min(Nt, i + taskSize)));
        }
        else {
            tm->enqueueChild(group, TriangleNormalsTask(indices,
                                                        triangleNormalsPtr,
                                                        P,
                                                        P_element_stride,
                                                        i, glm::min(Nt, i + taskSize)));
        }
    }
#endif
    // Calculate normalized per-triangle normals, optionally reversing the
    // winding by swapping b and c.
    if (flip == false) {
        for (size_t i = 0; i < Nt; i++) {
            const auto & a = glm::make_vec3(P + P_element_stride * indices[3 * i + 0]);
            const auto & b = glm::make_vec3(P + P_element_stride * indices[3 * i + 1]);
            const auto & c = glm::make_vec3(P + P_element_stride * indices[3 * i + 2]);
            const auto n = glm::cross(c - a, b - a);
            *reinterpret_cast<glm::vec3*>(triangleNormalsPtr + 4 * i) = glm::normalize(n);
        }
    }
    else {
        for (size_t i = 0; i < Nt; i++) {
            const auto & a = glm::make_vec3(P + P_element_stride * indices[3 * i + 0]);
            const auto & b = glm::make_vec3(P + P_element_stride * indices[3 * i + 2]);
            const auto & c = glm::make_vec3(P + P_element_stride * indices[3 * i + 1]);
            const auto n = glm::cross(c - a, b - a);
            *reinterpret_cast<glm::vec3*>(triangleNormalsPtr + 4 * i) = glm::normalize(n);
        }
    }

    // Attach corners to vertices.
    std::vector<uint32_t> head(Nv, ~0u);
    std::vector<Corner> corner(Ni);
    for (uint32_t t = 0; t < Nt; t++) {
        const auto a = indices[3 * t + 0];
        const auto b = indices[3 * t + 1];
        const auto c = indices[3 * t + 2];

        corner[3 * t + 0].Vp = c;
        corner[3 * t + 0].Vn = b;
        corner[3 * t + 0].triangle = (t << 2) + 0;
        corner[3 * t + 0].next = head[a];
        head[a] = 3 * t + 0;

        corner[3 * t + 1].Vp = a;
        corner[3 * t + 1].Vn = c;
        corner[3 * t + 1].triangle = (t << 2) + 1;
        corner[3 * t + 1].next = head[b];
        head[b] = 3 * t + 1;

        corner[3 * t + 2].Vp = b;
        corner[3 * t + 2].Vn = a;
        corner[3 * t + 2].triangle = (t << 2) + 2;
        corner[3 * t + 2].next = head[c];
        head[c] = 3 * t + 2;
    }
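
    // Each corner packs its triangle index and corner number into one uint32:
    // triangle t, corner k is stored as (t << 2) + k and decoded below with
    // (triangle >> 2) and (triangle & 3). Two bits suffice for k in 0..2, at
    // the cost of limiting the triangle count to 2^30.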

    const float protrusionCos = glm::cos(protrusionAngle);
    const float featureCos = glm::cos(featureAngle);

    std::vector<Corner> vertexCorners;
    std::vector<Corner> pie;

    N.clear();
    remap.clear();

    newIndices.clear();
    newIndices.resize(Ni);

    // Note: can be done in parallel tasks.
    for (uint32_t v = 0; v < Nv; v++) {
        glm::vec3 n_avg(0.0f);  // glm does not zero-initialize by default.

        vertexCorners.clear();
        for (auto n = head[v]; n != ~0u; n = corner[n].next) {
            vertexCorners.push_back(corner[n]);
            n_avg += glm::make_vec3(triangleNormalsPtr + 4 * (vertexCorners.back().triangle >> 2));
        }
        n_avg = glm::normalize(n_avg);

        // Partition the corners around v into "pies": fans of triangles whose
        // adjacent facet normals stay within the feature angle of each other.
        bool first = true;
        float minProtrusion = 1.f;
        while (!vertexCorners.empty()) {

            pie.clear();
            pie.push_back(vertexCorners.back());
            vertexCorners.pop_back();

        redo1:
            // Grow forwards
            for (size_t l = 0; l < vertexCorners.size(); l++) {
                if (pie.back().Vn == vertexCorners[l].Vp) {

                    const auto q = glm::make_vec3(triangleNormalsPtr + 4 * (vertexCorners[l].triangle >> 2));
                    if (featureCos <= glm::dot(glm::make_vec3(triangleNormalsPtr + 4 * (pie.back().triangle >> 2)), q))
                    {
                        minProtrusion = glm::min(minProtrusion, glm::dot(n_avg, q));
                        pie.push_back(vertexCorners[l]);
                        vertexCorners[l] = vertexCorners.back();
                        vertexCorners.pop_back();
                        goto redo1;
                    }
                }
            }

            if (!vertexCorners.empty()) {
                // Note: From here on the pie isn't fully ordered; instead of reversing everything
                // we just swap the front and back and start growing backwards from the back.
                std::swap(pie.front(), pie.back());
            redo2:
                // Grow backwards
                for (size_t l = 0; l < vertexCorners.size(); l++) {
                    if (pie.back().Vp == vertexCorners[l].Vn) {

                        const auto q = glm::make_vec3(triangleNormalsPtr + 4 * (vertexCorners[l].triangle >> 2));
                        if (featureCos <= glm::dot(glm::make_vec3(triangleNormalsPtr + 4 * (pie.back().triangle >> 2)), q))
                        {
                            minProtrusion = glm::min(minProtrusion, glm::dot(n_avg, q));
                            pie.push_back(vertexCorners[l]);
                            vertexCorners[l] = vertexCorners.back();
                            vertexCorners.pop_back();
                            goto redo2;
                        }
                    }
                }
            }

            if (first && vertexCorners.empty() && minProtrusion < protrusionCos) {
                // Don't smooth at protrusion points: emit one facet normal per corner.
                for (auto & slice : pie) {
                    auto ix = uint32_t(N.size());
                    newIndices[3 * (slice.triangle >> 2) + (slice.triangle & 3)] = ix;
                    remap.push_back(v);
                    N.push_back(glm::make_vec3(triangleNormalsPtr + 4 * (slice.triangle >> 2)));
                }
            }
            else {
                // Emit a single averaged normal shared by all corners in the pie.
                auto ix = uint32_t(N.size());
                glm::vec3 n(0.0f);
                for (auto & slice : pie) {
                    n += glm::make_vec3(triangleNormalsPtr + 4 * (slice.triangle >> 2));
                    newIndices[3 * (slice.triangle >> 2) + (slice.triangle & 3)] = ix;
                }
                remap.push_back(v);
                N.push_back(glm::normalize(n));
            }
            first = false;
        }
    }
}
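
// A hypothetical call of the overload above (names and angles illustrative,
// not taken from real call sites): positions are xyz floats packed at a
// 12-byte stride, and the outputs are a fresh normal set plus remapped
// indices. The context parameter is unused by this overload.
//
//   std::vector<glm::vec3> N;
//   std::vector<uint32_t> remap, newIndices;
//   GeometryProcessing::normalsFromIndexedTriangles(nullptr, N, remap, newIndices,
//                                                   positions, 3 * sizeof(float),
//                                                   vertexCount, indices, indexCount,
//                                                   glm::radians(40.f),  // featureAngle
//                                                   glm::radians(80.f),  // protrusionAngle
//                                                   false);              // flip
//   // remap[j] gives the source vertex of new vertex j; newIndices indexes N.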

void Cogs::Core::GeometryProcessing::normalsFromIndexedTriangles(Context* context,
                                                                 float* normals,
                                                                 uint32_t normalStride,
                                                                 const float* vertices,
                                                                 uint32_t vertexStride,
                                                                 const uint32_t numVertices,
                                                                 const uint32_t* indices,
                                                                 const uint32_t numIndices,
                                                                 const uint32_t taskSize_,
                                                                 std::atomic<uint64_t>* elapsed_us)
{
    auto & tm = context->taskManager;
    auto taskSize = 4 * taskSize_;
    auto numTriangles = numIndices / 3;

    // fixme: add fast-track for small meshes

    // Buffers, over-allocated by 64 bytes so their pointers can be cache-line aligned.
    Cogs::Memory::MemoryBuffer inverseHead(sizeof(uint32_t) * numVertices + 64);
    Cogs::Memory::MemoryBuffer inverseNext(sizeof(uint32_t) * numIndices + 64);
    Cogs::Memory::MemoryBuffer triangleNormals(4 * sizeof(float) * numTriangles + 64);
    std::memset(inverseHead.data(), ~0, inverseHead.size());

    auto * inverseHeadPtr = alignUpwards<uint32_t>(inverseHead.data());
    auto * inverseNextPtr = alignUpwards<uint32_t>(inverseNext.data());

    // Build per-vertex lists of the index-buffer positions referencing each vertex.
    {
        TaskId group = tm->createGroup();
        for (uint32_t ia = 0; ia < numIndices; ia += taskSize) {
            auto ib = std::min(numIndices, ia + taskSize);
            tm->enqueueChild(group, [ia, ib, inverseHeadPtr, inverseNextPtr, indices]
            {
                CpuInstrumentationScope(SCOPE_GEOMETRY, "NrmInvIx");
                for (uint32_t i = ia; i < ib; i++) {
#if defined( _WIN32 )
                    inverseNextPtr[i] = _InterlockedExchange((volatile long*)(inverseHeadPtr + indices[i]), i);
#else
                    inverseNextPtr[i] = __atomic_exchange_n(inverseHeadPtr + indices[i], i, __ATOMIC_ACQ_REL);
#endif
                }
            });
        }
        tm->wait(group);
        tm->destroy(group);
    }
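
    // Each task above pushes its index-buffer positions onto the per-vertex
    // lists with one atomic exchange per entry, so list order depends on task
    // interleaving (this only perturbs the floating-point accumulation order).
    // The single-threaded equivalent of the exchange is:
    //
    //   inverseNextPtr[i] = inverseHeadPtr[indices[i]];
    //   inverseHeadPtr[indices[i]] = i;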

    auto * triangleNormalsPtr = alignUpwards<float>(triangleNormals.data());

    // Calculate all triangle normals
    {
        TaskId group = tm->createGroup();
#if !defined(EMSCRIPTEN) && !defined(__APPLE__)
        if (context->features->supported(CPUFeature::SSE2)) {
            for (uint32_t i = 0; i < numTriangles; i += taskSize) {
                tm->enqueueChild(group, TriangleNormalsTaskSSE2b(indices,
                                                                 triangleNormalsPtr,
                                                                 vertices,
                                                                 vertexStride,
                                                                 i, glm::min(numTriangles, i + taskSize),
                                                                 elapsed_us));
            }
        }
        else
#endif
        {
            for (uint32_t i = 0; i < numTriangles; i += taskSize) {
                tm->enqueueChild(group, TriangleNormalsTask(indices,
                                                            triangleNormalsPtr,
                                                            vertices,
                                                            vertexStride,
                                                            i, glm::min(numTriangles, i + taskSize),
                                                            elapsed_us));
            }
        }
        tm->wait(group);
        tm->destroy(group);
    }

    // Aggregate triangle normals for each vertex
    {
        TaskId group = tm->createGroup();

#if !defined(EMSCRIPTEN) && !defined(__APPLE__)
        if (context->features->supported(CPUFeature::SSE2)) {
            for (uint32_t i = 0; i < numVertices; i += taskSize) {
                tm->enqueueChild(group, VertexNormalsTaskSSE2(normals,
                                                              normalStride,
                                                              triangleNormalsPtr,
                                                              inverseHeadPtr,
                                                              inverseNextPtr,
                                                              i, std::min(numVertices, i + taskSize),
                                                              elapsed_us));
            }
        }
        else
#endif
        {
            for (uint32_t i = 0; i < numVertices; i += taskSize) {
                tm->enqueueChild(group,
                                 VertexNormalsTask(normals,
                                                   normalStride,
                                                   triangleNormalsPtr,
                                                   inverseHeadPtr,
                                                   inverseNextPtr,
                                                   i, std::min(numVertices, i + taskSize),
                                                   elapsed_us));
            }
        }
        tm->wait(group);
        tm->destroy(group);
    }
}
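
// A hypothetical call of the packed overload above (buffer names illustrative).
// Note that normalStride and vertexStride are element strides in floats, not
// bytes, unlike P_stride in the first overload; a 4-float stride (xyz + pad)
// also keeps the SSE loads inside the buffer.
//
//   std::atomic<uint64_t> elapsed{ 0 };
//   GeometryProcessing::normalsFromIndexedTriangles(context,
//                                                   normals, 4,    // xyz + pad, stride in floats
//                                                   positions, 4,  // xyz + pad, stride in floats
//                                                   vertexCount,
//                                                   indices, indexCount,
//                                                   10000,         // base task granularity
//                                                   &elapsed);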