Cogs.Core
UniformGridSystem_isosurf_avx2.cpp

#include "Resources/Resources.h"
#include "Resources/MeshManager.h"
#include "Resources/VertexFormats.h"

#include "Context.h"

#include "../../../IsoSurfaces/MarchingCubesTables.h"

#include "Foundation/Logging/Logger.h"
#include "Foundation/Memory/MemoryBuffer.h"
#include "Foundation/Platform/Timer.h"

#include <glm/glm.hpp>

#include <immintrin.h> // AVX2 intrinsics (_mm256_*, _mm_*)

#include <cassert>
#include <cmath>   // log2
#include <cstdint>
#include <vector>

namespace {
    using namespace Cogs::Core;

    Cogs::Logging::Log logger = Cogs::Logging::getLogger("UniformGridSystem");

    struct Vertex
    {
        glm::vec3 pos;
        glm::vec3 nrm;
        glm::vec2 tex;
    };
    typedef uint16_t Idx;

    struct size3_t { size_t x, y, z; };

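    // analyzeAVX2 classifies every cell against each threshold and builds, in the
    // caller-provided scratch buffer `tmp`, the data the extraction passes below
    // consume:
    //
    //   offsets[]  one uint32 per sample per threshold: running index of the first
    //              vertex emitted by the cell anchored at that sample.
    //   codes[]    one uint8 per sample per threshold: the cell's 8-bit marching-
    //              cubes case code (0 for both empty and completely full cells).
    //   vtxwork[]  one uint32 per vertex to create: (cell << 8) | (axis << 5) | l.
    //   idxwork[]  one uint32 per index to create:  (cell << 8) | (axisShift << 2) | l,
    //              where the threshold index l only uses the low two bits.
    //
    // These field widths are inferred from how the values are packed here and
    // unpacked in vertexExtractAVX2 / indexExtractAVX2 below.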
//#pragma optimize( "", off )
    void analyzeAVX2(uint8_t* tmp,
                     unsigned& vertexCount_,
                     unsigned& occupiedCells_,
                     unsigned& indexCount_,
                     const float* values,
                     const glm::uvec3 gridLayout_,
                     const size_t T_n,
                     const float* thresholds)
    {
        size3_t gridLayout = { gridLayout_.x, gridLayout_.y, gridLayout_.z };

        assert((gridLayout.x & 7) == 0 && (gridLayout.y & 7) == 0 && (gridLayout.z & 7) == 0);
        assert((gridLayout.y*gridLayout.x & 31) == 0);
        assert(gridLayout.x <= 256 && "x must fit inside a byte");
        size3_t cellLayout = { gridLayout.x - 1, gridLayout.y - 1, gridLayout.z - 1 };
        size3_t gridLayoutL2 = {
            static_cast<size_t>(log2(gridLayout_.x)),
            static_cast<size_t>(log2(gridLayout_.y)),
            static_cast<size_t>(log2(gridLayout_.z))
        };
        assert((size_t(1) << gridLayoutL2.x) == gridLayout.x &&
               (size_t(1) << gridLayoutL2.y) == gridLayout.y &&
               (size_t(1) << gridLayoutL2.z) == gridLayout.z);

        const auto * axesTable = MarchingCubes::axesTable().data();
        const auto * indexCountTable = MarchingCubes::indexCountTable().data();
        const auto * indexTable = MarchingCubes::indexTable().data();

        auto * vtxwork = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z*((sizeof(uint32_t) + sizeof(uint8_t))*T_n));
        auto * idxwork = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z*((sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t))*T_n));

        unsigned vertexCount = 0;
        unsigned occupiedCells = 0;
        unsigned indexCount = 0;
        for (unsigned l = 0; l < T_n; l++) {
            auto * offsets = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z * sizeof(uint32_t)*l);
            auto * codes = (tmp + gridLayout.x*gridLayout.y*gridLayout.z*(sizeof(uint32_t)*T_n + l));
            auto T = _mm256_set1_ps(thresholds[l]);
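            // First pass: classify every sample against the current threshold and
            // store one byte per sample (1 if the sample lies below the threshold,
            // 0 otherwise). A scalar sketch of what the 8-wide loop below computes:
            //
            //     for (size_t s = 0; s < gridLayout.x*gridLayout.y*gridLayout.z; ++s)
            //         codes[s] = values[s] < thresholds[l] ? 1 : 0;
            //
            // The AVX2 version compares 8 floats at a time, masks the comparison down
            // to a single bit per lane and packs the eight 32-bit lanes into bytes.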
            {
                auto M = _mm256_set1_epi32(1);
                auto * c = codes;
                for (unsigned k = 0; k < gridLayout.z*gridLayout.y*gridLayout.x; k += 8) {
                    auto t0 = _mm256_load_ps(values + k);
                    auto t1 = _mm256_and_si256(M, _mm256_castps_si256(_mm256_cmp_ps(t0, T, _CMP_LT_OQ)));
                    auto t2 = _mm256_extracti128_si256(t1, 1);
                    auto t3 = _mm256_extracti128_si256(t1, 0);
                    auto t4 = _mm_packus_epi32(t3, t2);
                    auto t5 = _mm_packus_epi16(t4, _mm_setzero_si128());
                    _mm_storel_epi64((__m128i*)c, t5);
                    c += 8;
                }
            }
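            // Second pass: fold the slice above into each sample byte. Afterwards
            // bit 0 of codes[s] says whether sample (x, y, z) is below the threshold
            // and bit 4 says the same for sample (x, y, z + 1). The last slice has no
            // neighbour above, so it is paired with itself, which keeps cells in that
            // slice from ever seeing a crossing along the z axis.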
            { // merge across slices
                auto * c = (__m256i*)codes;
                const auto M = (gridLayout.y*gridLayout.x) / sizeof(__m256i);
                for (unsigned k = 0; k < cellLayout.z; k++) {
                    for (unsigned j = 0; j < M; j++) {
                        auto t0 = _mm256_load_si256((__m256i*)c);
                        auto t1 = _mm256_load_si256((__m256i*)(c + M));
                        auto t2 = _mm256_slli_epi32(t1, 4);
                        auto t3 = _mm256_or_si256(t0, t2);
                        _mm256_store_si256(c++, t3);
                    }
                }
                for (unsigned j = 0; j < M; j++) {
                    auto t0 = _mm256_load_si256((__m256i*)c);
                    auto t2 = _mm256_slli_epi32(t0, 4);
                    auto t3 = _mm256_or_si256(t0, t2);
                    _mm256_store_si256(c++, t3);
                }
            }
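            // Third pass: combine each sample byte with its +x and +y neighbours
            // (and, via bit 4, the merged +z neighbours) into the full 8-bit
            // marching-cubes case code of the cell anchored at that sample.
            // Completely full cells (code 0xff) are cleared to 0 so empty and full
            // cells can both be skipped with one zero test. Samples on the +x/+y/+z
            // boundary ("skirt") reuse their own value for the missing neighbour;
            // such cells may still emit vertices (so interior neighbours can index
            // them) but never emit indices of their own.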
            {
                auto mask_i_epi8 = _mm256_set1_epi8(static_cast<char>(gridLayout.x - 1));
                auto mask_j_epi8 = _mm256_set1_epi8(static_cast<char>(gridLayout.y - 1));
                auto stride = _mm256_set1_epi8(32);
                auto ki = _mm256_set_epi8(31, 30, 29, 28,
                                          27, 26, 25, 24,
                                          23, 22, 21, 20,
                                          19, 18, 17, 16,
                                          15, 14, 13, 12,
                                          11, 10, 9, 8,
                                          7, 6, 5, 4,
                                          3, 2, 1, 0);
                auto kj = _mm256_set_epi8(31 >> gridLayoutL2.x, 30 >> gridLayoutL2.x, 29 >> gridLayoutL2.x, 28 >> gridLayoutL2.x,
                                          27 >> gridLayoutL2.x, 26 >> gridLayoutL2.x, 25 >> gridLayoutL2.x, 24 >> gridLayoutL2.x,
                                          23 >> gridLayoutL2.x, 22 >> gridLayoutL2.x, 21 >> gridLayoutL2.x, 20 >> gridLayoutL2.x,
                                          19 >> gridLayoutL2.x, 18 >> gridLayoutL2.x, 17 >> gridLayoutL2.x, 16 >> gridLayoutL2.x,
                                          15 >> gridLayoutL2.x, 14 >> gridLayoutL2.x, 13 >> gridLayoutL2.x, 12 >> gridLayoutL2.x,
                                          11 >> gridLayoutL2.x, 10 >> gridLayoutL2.x, 9 >> gridLayoutL2.x, 8 >> gridLayoutL2.x,
                                          7 >> gridLayoutL2.x, 6 >> gridLayoutL2.x, 5 >> gridLayoutL2.x, 4 >> gridLayoutL2.x,
                                          3 >> gridLayoutL2.x, 2 >> gridLayoutL2.x, 1 >> gridLayoutL2.x, 0 >> gridLayoutL2.x);
                auto ones = _mm256_set1_epi32(~0);

                //const auto mask_i = (1 << gridLayoutL2.x) - 1;
                //const auto mask_j = ((1 << gridLayoutL2.y) - 1) << gridLayoutL2.x;
                //const auto mask_k = ((1 << gridLayoutL2.z) - 1) << (gridLayoutL2.x + gridLayoutL2.y);

                for (unsigned i = 0; i < gridLayout.z*gridLayout.y*gridLayout.x; i += 32) {
                    auto skirt_i = _mm256_cmpeq_epi8(_mm256_and_si256(ki, mask_i_epi8), mask_i_epi8);
                    ki = _mm256_add_epi8(ki, stride);

                    // corner 0,0,0 and corner 1,0,0
                    auto t0 = _mm256_load_si256((__m256i*)(codes + i));
                    auto t1 = _mm256_loadu_si256((__m256i*)(codes + i + 1));
                    auto t2 = _mm256_blendv_epi8(t1, t0, skirt_i);
                    auto t3 = _mm256_or_si256(t0, _mm256_slli_epi32(t2, 1));

                    auto row = _mm256_add_epi8(_mm256_set1_epi8(static_cast<char>(i >> gridLayoutL2.x)), kj);
                    auto skirt_j = _mm256_cmpeq_epi8(_mm256_and_si256(row, mask_j_epi8), mask_j_epi8);

                    // corner 0,1,0 and corner 1,1,0
                    auto r0 = _mm256_load_si256((__m256i*)(codes + gridLayout.x + i));
                    auto r1 = _mm256_loadu_si256((__m256i*)(codes + gridLayout.x + i + 1));
                    auto r2 = _mm256_blendv_epi8(r1, r0, skirt_i);
                    auto r3 = _mm256_or_si256(r0, _mm256_slli_epi32(r2, 1));
                    auto r4 = _mm256_blendv_epi8(r3, t3, skirt_j);

                    auto t5 = _mm256_or_si256(t3, _mm256_slli_epi32(r4, 2)); // merge

                    auto ones_ = _mm256_cmpeq_epi8(_mm256_setzero_si256(), _mm256_setzero_si256());
                    auto m2 = _mm256_cmpeq_epi8(t5, ones_);
                    auto code_epi8 = _mm256_andnot_si256(m2, t5);
                    _mm256_store_si256((__m256i*)(codes + i), code_epi8);

                    auto m3 = _mm256_cmpeq_epi8(code_epi8, _mm256_setzero_si256());
                    uint32_t zerolanes = _mm256_movemask_epi8(m3);
                    if (zerolanes == 0xffffffff) continue;

                    auto skirt_k = _mm256_set1_epi8((i >> (gridLayoutL2.x + gridLayoutL2.y)) == ((1u << gridLayoutL2.z) - 1u) ? 255 : 0);
                    auto skirt = _mm256_or_si256(_mm256_or_si256(skirt_i, skirt_j), skirt_k);

                    for (unsigned r = 0; r < 32; r++) {
                        if (m3.m256i_u8[r] == 0) {
                            auto code = code_epi8.m256i_u8[r];
                            offsets[i + r] = vertexCount;
                            auto k = ((i + r) << 8) | l;

                            auto axes = axesTable[code];
                            if (axes & 4) {
                                vtxwork[vertexCount++] = k | (4 << 5);
                            }
                            if (axes & 2) {
                                vtxwork[vertexCount++] = k | (2 << 5);
                            }
                            if (axes & 1) {
                                vtxwork[vertexCount++] = k | (1 << 5);
                            }
                            if (skirt.m256i_u8[r] == 0) {
                                occupiedCells++;
                                auto Ni = indexCountTable[code];
                                auto * as = indexTable + 16 * size_t(code);
                                for (unsigned ll = 0; ll < Ni; ll += 3) {
                                    idxwork[indexCount++] = k | (as[ll + 0] << 2);
                                    idxwork[indexCount++] = k | (as[ll + 1] << 2);
                                    idxwork[indexCount++] = k | (as[ll + 2] << 2);
                                }
                            }
                        }
                    }
                }
            }
        }

        // Pad both work queues up to a multiple of 8 by repeating the last entry,
        // so the 8-wide extraction loops below always process full registers.
        if (auto t = vertexCount; t) {
            while (t & 7) vtxwork[t++] = vtxwork[vertexCount - 1];
        }
        if (auto t = indexCount; t) {
            while (t & 7) idxwork[t++] = idxwork[indexCount - 1];
        }

        vertexCount_ = vertexCount;
        occupiedCells_ = occupiedCells;
        indexCount_ = indexCount;
    }
    //#pragma optimize( "", on )


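    // vertexExtractAVX2 converts each vtxwork entry into one interleaved
    // Pos3f/Norm3f/Tex2f vertex, eight entries at a time. For an edge running from
    // sample a to sample b along one axis, the scalar equivalent of the arithmetic
    // below is:
    //
    //     t     = (threshold - a) / (b - a);          // reciprocal approximated with _mm256_rcp_ps
    //     pos   = corner + t * axis;                  // in grid-index units
    //     nrm   = -((1 - t) * d(a) + t * d(b));       // d() = one-sided difference, clamped at the border
    //     tex.x = (l + 0.5f) / T_n;  tex.y = 0.5f;    // per-threshold texture coordinate
    //
    // The normals are left unnormalised.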
    //#pragma optimize( "", off )
    unsigned vertexExtractAVX2(Cogs::Core::MappedStream<Vertex>& vertices,
                               uint8_t* tmp,
                               const float* values,
                               const glm::uvec3 gridLayout_,
                               const unsigned T_n,
                               const float* thresholds,
                               unsigned vn)
    {
        size3_t gridLayout = { gridLayout_.x, gridLayout_.y, gridLayout_.z };
        size3_t cellLayout = { gridLayout.x - 1, gridLayout.y - 1, gridLayout.z - 1 };

        size3_t gridLayoutL2 = {
            static_cast<size_t>(log2(gridLayout_.x)),
            static_cast<size_t>(log2(gridLayout_.y)),
            static_cast<size_t>(log2(gridLayout_.z))
        };
        assert((size_t(1) << gridLayoutL2.x) == gridLayout.x &&
               (size_t(1) << gridLayoutL2.y) == gridLayout.y &&
               (size_t(1) << gridLayoutL2.z) == gridLayout.z);


        auto * work = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z*((sizeof(uint32_t) + sizeof(uint8_t))*T_n));

        //const auto mask_i = _mm256_set1_epi32(gridLayout.x - 1);
        const auto mask_j = _mm256_set1_epi32(static_cast<int>(gridLayout.y - 1));
        const auto mask_k = _mm256_set1_epi32(static_cast<int>(gridLayout.z - 1));
        const auto X_L2_epi64 = _mm_insert_epi64(_mm_undefined_si128(), gridLayoutL2.x, 0);
        const auto XY_L2_epi64 = _mm_insert_epi64(_mm_undefined_si128(), gridLayoutL2.x + gridLayoutL2.y, 0);
        const auto oneOverTn = _mm256_set1_ps(1.f / T_n);

        const auto one = _mm256_set1_epi32(1);
        const auto X = _mm256_set1_epi32(static_cast<int>(gridLayout.x));
        const auto XY = _mm256_set1_epi32(static_cast<int>(gridLayout.x*gridLayout.y));
        const auto C3 = _mm256_setr_ps(thresholds[0], thresholds[1], thresholds[2], thresholds[3],
                                       thresholds[0], thresholds[1], thresholds[2], thresholds[3]);
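        // Note: C3 holds only thresholds[0..3] and the _mm256_permutevar_ps below
        // selects with the low two bits of the threshold index, so this path assumes
        // at most four thresholds (and reads thresholds[0..3] even when T_n is smaller).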
        for (size_t v = 0; v < vn; v += 8) {
            auto w = _mm256_loadu_si256((__m256i*)(work + v));
            auto o0 = _mm256_srli_epi32(w, 8);
            auto a = _mm256_i32gather_ps(values, o0, 4);

            // Extract i,j,k of sample point 0
            auto mask_i = _mm256_sub_epi32(X, one);
            auto i = _mm256_and_si256(o0, mask_i);
            auto j = _mm256_and_si256(_mm256_srl_epi32(o0, X_L2_epi64), mask_j);
            auto k = _mm256_and_si256(_mm256_srl_epi32(o0, XY_L2_epi64), mask_k);

            // mask set if i < gridLayout.x-1 => i+1 is safe to access.
            auto i0_less = _mm256_cmpgt_epi32(mask_i, i);
            auto j0_less = _mm256_cmpgt_epi32(mask_j, j);
            auto k0_less = _mm256_cmpgt_epi32(mask_k, k);

            // Gather and calc discrete differences at first point
            auto ax = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(i0_less, one)), 4), a);
            auto ay = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(j0_less, X)), 4), a);
            auto az = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(k0_less, XY)), 4), a);

            auto di = _mm256_and_si256(_mm256_srli_epi32(w, 5), one); // 1 if the intersection is on the x-axis (dj, dk likewise for y, z)
            auto dj = _mm256_and_si256(_mm256_srli_epi32(w, 6), one);
            auto dk = _mm256_and_si256(_mm256_srli_epi32(w, 7), one);

            // All-set mask if the intersection is on the x, y or z axis respectively.
            auto mx = _mm256_cmpeq_epi32(di, one);
            auto my = _mm256_cmpeq_epi32(dj, one);
            auto mz = _mm256_cmpeq_epi32(dk, one);

            // Form index of sample point 1
            auto o1 = _mm256_add_epi32(_mm256_add_epi32(o0, di),
                                       _mm256_add_epi32(_mm256_and_si256(my, X),
                                                        _mm256_and_si256(mz, XY)));
            auto b = _mm256_i32gather_ps(values, o1, 4);

            // mask set if e.g. k + dk < gridLayout.z-1 => k + dk + 1 is safe to access.
            auto i1_less = _mm256_cmpgt_epi32(mask_i, _mm256_add_epi32(i, di));
            auto j1_less = _mm256_cmpgt_epi32(mask_j, _mm256_add_epi32(j, dj));
            auto k1_less = _mm256_cmpgt_epi32(mask_k, _mm256_add_epi32(k, dk));

            // Gather and calc discrete differences at second point
            auto bx = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(i1_less, one)), 4), b);
            auto by = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(j1_less, X)), 4), b);
            auto bz = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(k1_less, XY)), 4), b);

            auto l = _mm256_and_si256(w, _mm256_set1_epi32((1 << 5) - 1));
            auto T = _mm256_permutevar_ps(C3, l);
            auto t = _mm256_mul_ps(_mm256_sub_ps(T, a), _mm256_rcp_ps(_mm256_sub_ps(b, a)));

            // Interpolate position
            auto px = _mm256_add_ps(_mm256_cvtepi32_ps(i), _mm256_and_ps(_mm256_castsi256_ps(mx), t));
            auto py = _mm256_add_ps(_mm256_cvtepi32_ps(j), _mm256_and_ps(_mm256_castsi256_ps(my), t));
            auto pz = _mm256_add_ps(_mm256_cvtepi32_ps(k), _mm256_and_ps(_mm256_castsi256_ps(mz), t));

            // Form negative of interpolated gradient (-a + t(a-b))
            auto gx = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(ax, bx)), ax);
            auto gy = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(ay, by)), ay);
            auto gz = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(az, bz)), az);

            auto half = _mm256_set1_ps(0.5f);
            auto tp = _mm256_mul_ps(oneOverTn, _mm256_add_ps(_mm256_cvtepi32_ps(l), half));

            if (v + 8 <= vn) {
                //px = _mm256_setr_ps( 0,  8, 16, 24, 32, 40, 48, 55);
                //py = _mm256_setr_ps( 1,  9, 17, 25, 33, 41, 49, 56);
                //pz = _mm256_setr_ps( 2, 10, 18, 26, 34, 42, 50, 57);
                //gx = _mm256_setr_ps( 3, 11, 19, 27, 35, 43, 51, 58);
                //gy = _mm256_setr_ps( 4, 12, 20, 28, 36, 44, 52, 59);
                //gz = _mm256_setr_ps( 5, 13, 21, 29, 37, 45, 53, 60);
                //tp = _mm256_setr_ps( 6, 14, 22, 30, 38, 46, 54, 61);

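                // px..tp each hold one attribute of eight consecutive vertices
                // (structure-of-arrays). The unpack/shuffle/blend/permute sequence
                // below is an 8x8 transpose that writes them back out as eight
                // interleaved Pos3fNorm3fTex2f vertices, with tex.y padded to 0.5
                // via `half`. The numbers in the trailing comments refer to the
                // commented-out debug constants above.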
                auto a0 = _mm256_unpacklo_ps(px, py);   //  0  1  8  9 | 32 33 40 41
                auto a2 = _mm256_unpacklo_ps(pz, gx);   //  2  3 10 11 | 34 35 42 43
                auto a4 = _mm256_unpacklo_ps(gy, gz);   //  4  5 12 13 | 36 37 44 45
                auto a6 = _mm256_unpacklo_ps(tp, half); //  6  h 14  h | 38  h 46  h

                auto b0_ = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(1, 0, 3, 2)); //  8  9  2  3 | 40 41 34 35
                auto b0 = _mm256_blend_ps(a0, b0_, 0xCC); // = 11001100       //  0  1  2  3 | 32 33 34 35
                auto b1 = _mm256_blend_ps(a2, b0_, 0x33); // = 00110011       //  8  9 10 11 | 40 41 42 43

                //auto b0 = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(1, 0, 1, 0)); //  0  1  2  3 | 32 33 34 35
                //auto b1 = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(3, 2, 3, 2)); //  8  9 10 11 | 40 41 42 43

                auto b4_ = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(1, 0, 3, 2));
                auto b4 = _mm256_blend_ps(a4, b4_, 0xCC);
                auto b5 = _mm256_blend_ps(a6, b4_, 0x33);
                //auto b4 = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(1, 0, 1, 0)); //  4  5  6  h | 36 37 38  h
                //auto b5 = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(3, 2, 3, 2)); // 12 13 14  h | 44 45 46  h

                _mm256_store_ps((float*)vertices.data + 8 * (v + 0), _mm256_permute2f128_ps(b0, b4, 0x20));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 4), _mm256_permute2f128_ps(b0, b4, 0x31));

                auto a1 = _mm256_unpackhi_ps(px, py); // 16 17 24 25 | 48 49 55 56
                auto a3 = _mm256_unpackhi_ps(pz, gx); // 18 19 26 27 | 50 51 57 58

                auto b2_ = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(1, 0, 3, 2));
                auto b2 = _mm256_blend_ps(a1, b2_, 0xCC);
                auto b3 = _mm256_blend_ps(a3, b2_, 0x33);
                //auto b2 = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(1, 0, 1, 0)); // 16 17 18 19 | 48 49 50 51
                //auto b3 = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(3, 2, 3, 2)); // 24 25 26 27 | 55 56 57 58

                auto a5 = _mm256_unpackhi_ps(gy, gz);   // 20 21 28 29 | 52 53 59 60
                auto a7 = _mm256_unpackhi_ps(tp, half); // 22  h 30  h | 54  h 61  h

                auto b6_ = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(1, 0, 3, 2));
                auto b6 = _mm256_blend_ps(a5, b6_, 0xCC);
                auto b7 = _mm256_blend_ps(a7, b6_, 0x33);
                //auto b6 = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(1, 0, 1, 0)); // 20 21 22  h | 52 53 54  h
                //auto b7 = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(3, 2, 3, 2)); // 28 29 30  h | 59 60 61  h

                _mm256_store_ps((float*)vertices.data + 8 * (v + 1), _mm256_permute2f128_ps(b1, b5, 0x20));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 2), _mm256_permute2f128_ps(b2, b6, 0x20));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 3), _mm256_permute2f128_ps(b3, b7, 0x20));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 5), _mm256_permute2f128_ps(b1, b5, 0x31));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 6), _mm256_permute2f128_ps(b2, b6, 0x31));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 7), _mm256_permute2f128_ps(b3, b7, 0x31));
            }
            else {
                for (size_t ll = 0; ll < 8 && v + ll < vn; ll++) {
                    vertices[v + ll].pos.x = px.m256_f32[ll];
                    vertices[v + ll].pos.y = py.m256_f32[ll];
                    vertices[v + ll].pos.z = pz.m256_f32[ll];
                    vertices[v + ll].nrm.x = gx.m256_f32[ll];
                    vertices[v + ll].nrm.y = gy.m256_f32[ll];
                    vertices[v + ll].nrm.z = gz.m256_f32[ll];
                    vertices[v + ll].tex.x = tp.m256_f32[ll];
                    vertices[v + ll].tex.y = 0.5f;
                }
            }

        }
        //assert(vix == vo);
        return vn;
    }
    //#pragma optimize( "", on )

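    // indexExtract is the scalar index-resolution pass. For every non-empty cell it
    // walks the MarchingCubes::indexTable() entries for that cell's code: the high
    // bits of an entry select the neighbouring cell that owns the edge vertex, and
    // the final index is that cell's offsets[] value plus one for each of the
    // earlier-emitted axes (emission order in analyzeAVX2 is 4, 2, 1) left unmasked
    // by the entry's low bits. It also records the running index count per threshold
    // in sub_mesh[], so the caller can build one sub-mesh per iso-value.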
    unsigned indexExtract(uint32_t* sub_mesh,
                          Idx* indices,
                          uint8_t* tmp,
                          const glm::uvec3 gridLayout,
                          const unsigned T_n,
                          const unsigned /*indexCount*/,
                          const unsigned vertexCount)
    {
        const auto cellLayout = gridLayout - glm::uvec3(1, 1, 1);
        const auto * axesTable = MarchingCubes::axesTable().data();
        const auto * indexTable = MarchingCubes::indexTable().data();

        unsigned iix = 0;
        for (unsigned l = 0; l < T_n; l++) {
            auto * offsets = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z * sizeof(uint32_t)*l);
            auto * codes = tmp + gridLayout.x*gridLayout.y*gridLayout.z*(sizeof(uint32_t)*T_n + l);
            auto * c = codes;
            for (unsigned k = 0; k < cellLayout.z; k++) {
                for (unsigned j = 0; j < cellLayout.y; j++) {
                    for (unsigned i = 0; i < cellLayout.x; i++) {
                        auto code = *c++;
                        if (code == 0) continue;

                        auto * axisShifts = indexTable + 16 * code;
                        while (true)
                        {
                            auto axisShift = *axisShifts++;
                            if (axisShift == 255) break;

                            auto ii = i + (axisShift & 8 ? 1 : 0);
                            auto jj = j + (axisShift & 16 ? 1 : 0);
                            auto kk = k + (axisShift & 32 ? 1 : 0);

                            auto shiftedCell = (kk*gridLayout.y + jj)*gridLayout.x + ii;

                            const auto shiftedCode = codes[shiftedCell];
                            const auto axes = axesTable[shiftedCode] & axisShift;

                            const auto ix = offsets[shiftedCell] + ((axes & 4) ? 1 : 0) + ((axes & 2) ? 1 : 0);
                            assert(ix < vertexCount);

                            assert(sizeof(Idx) != 2 || ix < 0xffff);
                            indices[iix++] = (Idx)ix;
                        }
                    }

                    c++;
                }
                c += gridLayout.x;
            }
            sub_mesh[l + 1] = iix;
        }

        return iix;
    }

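    // indexExtractAVX2 resolves the same indices directly from the packed idxwork
    // queue, eight entries per iteration, using gathers for the owning cell's code
    // and vertex offset. It produces 32-bit indices and does not fill per-threshold
    // ranges; createIsoSurfacesAVX2 below currently calls the scalar indexExtract
    // instead.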
//#pragma optimize( "", off)
    unsigned indexExtractAVX2(uint32_t* indices,
                              uint8_t* tmp,
                              const glm::uvec3 gridLayout_,
                              const unsigned T_n,
                              const unsigned indexCount,
                              const unsigned /*vertexCount*/)
    {
        size3_t gridLayout = { gridLayout_.x, gridLayout_.y, gridLayout_.z };
        size3_t cellLayout = { gridLayout.x - 1, gridLayout.y - 1, gridLayout.z - 1 };
        const auto * axesTable = MarchingCubes::axesTable().data();
        auto * idxwork = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z*((sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t))*T_n));

        size3_t gridLayoutL2 = {
            static_cast<size_t>(log2(gridLayout_.x)),
            static_cast<size_t>(log2(gridLayout_.y)),
            static_cast<size_t>(log2(gridLayout_.z))
        };
        assert((size_t(1) << gridLayoutL2.x) == gridLayout.x &&
               (size_t(1) << gridLayoutL2.y) == gridLayout.y &&
               (size_t(1) << gridLayoutL2.z) == gridLayout.z);

        {
            auto * offsets = (uint32_t*)tmp;
            auto * codes = tmp + gridLayout.x*gridLayout.y*gridLayout.z*(sizeof(uint32_t)*T_n);

            const auto one_epi32 = _mm256_set1_epi32(1);
            const auto three_epi32 = _mm256_set1_epi32(3);
            const auto gridL2_3_epi64 = _mm_set1_epi64x(gridLayoutL2.x + gridLayoutL2.y + gridLayoutL2.z);
            const auto X_epi32 = _mm256_set1_epi32(static_cast<int>(gridLayout.x));
            const auto XY_epi32 = _mm256_set1_epi32(static_cast<int>(gridLayout.x*gridLayout.y));
            const auto mask_255_epi32 = _mm256_set1_epi32(255);
            for (size_t iix = 0; iix < indexCount; iix += 8) {
                const auto w = _mm256_load_si256((const __m256i*)(idxwork + iix));
                const auto o0 = _mm256_srli_epi32(w, 8);

                // (((w >> 5) & 1), (((w >> 6) & 1) ? gridLayout.x : 0), (((w >> 7) & 1) ? gridLayout.x*gridLayout.y : 0))
                const auto shiftI = _mm256_and_si256(_mm256_srli_epi32(w, 5), one_epi32);
                const auto shiftJ = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(_mm256_srli_epi32(w, 6), one_epi32), one_epi32), X_epi32);
                const auto shiftK = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(_mm256_srli_epi32(w, 7), one_epi32), one_epi32), XY_epi32);

                const auto shiftedCell = _mm256_add_epi32(_mm256_add_epi32(o0, shiftI),
                                                          _mm256_add_epi32(shiftJ, shiftK));

                const auto L = _mm256_sll_epi32(_mm256_and_si256(w, three_epi32), gridL2_3_epi64); // l << (gridLayoutL2.x + gridLayoutL2.y + gridLayoutL2.z)
                const auto I = _mm256_add_epi32(shiftedCell, L);

                const auto shiftedCode = _mm256_and_si256(_mm256_i32gather_epi32((const int*)codes, I, 1), mask_255_epi32);
                const auto axisShift = _mm256_srli_epi32(w, 2);
                const auto axes = _mm256_and_si256(_mm256_i32gather_epi32((const int*)axesTable, shiftedCode, 1), axisShift);

                const auto addJ = _mm256_and_si256(_mm256_srli_epi32(axes, 1), one_epi32);
                const auto addK = _mm256_and_si256(_mm256_srli_epi32(axes, 2), one_epi32);
                const auto offset = _mm256_add_epi32(_mm256_i32gather_epi32((const int*)offsets, I, 4),
                                                     _mm256_add_epi32(addJ, addK));

                _mm256_store_si256((__m256i*)(indices + iix), offset);
            }

        }
        return indexCount;
    }
    //#pragma optimize( "", on )


}

namespace Cogs::Core::EchoSounder {


    //#pragma optimize( "", off )
    MeshHandle createIsoSurfacesAVX2(Context* context,
                                     uint64_t& analyze,
                                     uint64_t& vtx,
                                     uint64_t& idx,
                                     MemoryBuffer& scratch,
                                     const float* values,
                                     const glm::uvec3 gridLayout,
                                     const glm::vec3 /*minCorner*/,
                                     const glm::vec3 /*maxCorner*/,
                                     const float* thresholds, size_t count)
    {
        assert(gridLayout.x != 0 && gridLayout.y != 0 && gridLayout.z != 0);

        const uint32_t T_n = (uint32_t)count;

        // gridLayout is the lattice of sample positions; a cell must be bounded by
        // samples on all sides, so the cell layout is one less along each dimension.
        //const auto indexCountTable = MarchingCubes::indexCountTable();

        //const auto * vertexCountTable = MarchingCubes::vertexCountTable().data();


        const auto cellLayout = gridLayout - glm::uvec3(1, 1, 1);

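        // Scratch sizing per sample and per threshold: 4 bytes of vertex offsets,
        // 1 byte of cell codes, 3 uint32 of vertex work (a cell owns at most three
        // edge vertices) and 15 uint32 of index work (at most five triangles per
        // cell). The offsets match the pointer arithmetic in analyzeAVX2 and the
        // extraction helpers above.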
        scratch.resize(T_n*(sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t) + 15*sizeof(uint32_t))*gridLayout.x*gridLayout.y*gridLayout.z, false);
        auto * tmp = (uint8_t*)scratch.data();

        unsigned occupiedCells = 0;
        unsigned indexCount = 0;
        unsigned vertexCount = 0;

        auto timer = Timer::startNew();
        analyzeAVX2(tmp, vertexCount, occupiedCells, indexCount, values, gridLayout, T_n, thresholds);
        analyze = timer.elapsedMicroseconds();

        if (occupiedCells == 0) {
            vtx = 0;
            idx = 0;
            return MeshHandle::NoHandle;
        }

        auto mesh = context->meshManager->createLocked();
        //mesh->setBounds(Cogs::Geometry::BoundingBox{ glm::vec3(0,0,0), glm::vec3(gridLayout.x, gridLayout.y, gridLayout.z) });
        mesh->setMeshFlag(MeshFlags::ClockwiseWinding);
        mesh->primitiveType = PrimitiveType::TriangleList;

        auto vertices = mesh->map<Vertex>(VertexDataType::Interleaved0, VertexFormats::Pos3fNorm3fTex2f, vertexCount);

        // Extract vertices
        timer = Timer::startNew();
        auto vix = vertexExtractAVX2(vertices, tmp, values, gridLayout, T_n, thresholds, vertexCount);
        assert(vix == vertexCount);
        vtx = timer.elapsedMicroseconds();

        // Extract indices
        std::vector<uint32_t> sub_mesh(T_n + 1, 0);
        mesh->clearIndexes();
        Idx* indices = (Idx*)mesh->mapStream(VertexDataType::Indexes, 0, indexCount, sizeof(Idx), true);
        timer = Timer::startNew();
        auto iix = indexExtract(sub_mesh.data(), indices, tmp, gridLayout, T_n, indexCount, vertexCount);
        assert(iix == indexCount);
        mesh->setMeshFlag(MeshFlags::Indexed);
        mesh->setMeshFlag(MeshFlags::IndexesChanged);
        idx = timer.elapsedMicroseconds();

        auto subMeshes = mesh->mapSubMeshes(T_n);
        for (unsigned l = 0; l < T_n; l++) {
            uint32_t start = sub_mesh[l];
            uint32_t size = sub_mesh[l + 1] - sub_mesh[l];
            subMeshes[l] = { start, size, PrimitiveType::TriangleList };
        }
        mesh->setCount(indexCount);

        auto box = Geometry::makeEmptyBoundingBox<Geometry::BoundingBox>();
        for (unsigned i = 0; i < vertexCount; i++) {
            glm::vec3 pos = vertices[i].pos;
            box.min = glm::min(box.min, pos);
            box.max = glm::max(box.max, pos);
        }
        mesh->setBounds(box);

        _mm256_zeroupper();
        return mesh.getHandle();
    }
    //#pragma optimize( "", on )

}
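
// A hedged usage sketch (not part of this translation unit); the scratch-buffer
// type and the NoHandle comparison are assumptions based on the includes and the
// early-out above:
//
//     MemoryBuffer scratch;
//     uint64_t tAnalyze = 0, tVtx = 0, tIdx = 0;
//     const float thresholds[] = { 0.25f, 0.5f, 0.75f };   // at most four thresholds
//     auto handle = Cogs::Core::EchoSounder::createIsoSurfacesAVX2(
//         context, tAnalyze, tVtx, tIdx, scratch,
//         volumeData, gridLayout, minCorner, maxCorner,
//         thresholds, 3);
//     if (handle != Cogs::Core::MeshHandle::NoHandle) {
//         // attach the mesh to an entity's MeshComponent, etc.
//     }
//
// gridLayout must have power-of-two extents that are multiples of 8, with x <= 256
// and x*y a multiple of 32, and the value array must be 32-byte aligned, per the
// asserts and aligned loads in analyzeAVX2.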