Cogs.Core
UniformGridSystem_isosurf_split_avx2.cpp
#include "Resources/Resources.h"
#include "Resources/MeshManager.h"
#include "Resources/VertexFormats.h"

#include "Context.h"

#include "../../../IsoSurfaces/MarchingCubesTables.h"

#include "Foundation/Logging/Logger.h"
#include "Foundation/Memory/MemoryBuffer.h"
#include "Foundation/Platform/Timer.h"

#include <glm/glm.hpp>

#include <immintrin.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

namespace {
    using namespace Cogs::Core;

    Cogs::Logging::Log logger = Cogs::Logging::getLogger("UniformGridSystem");

    struct Vertex
    {
        glm::vec3 pos;
        glm::vec3 nrm;
        glm::vec2 tex;
    };
    typedef uint16_t Idx;
    struct size3_t { size_t x, y, z; };

//#pragma optimize( "", off )
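    // analyzeAVX2: classification pass over one z-slab of the grid. For each
    // threshold it builds an 8-bit marching-cubes corner code per sample, stores
    // each intersected cell's first vertex index in the per-threshold offsets
    // array (vertexCount_ carries the running total across slabs), and appends
    // compact work items for the extraction passes:
    //   vtxwork entry: (cellIndex << 8) | (axisBit << 5) | thresholdIndex
    //   idxwork entry: (cellIndex << 8) | (axisShift << 2) | thresholdIndex
    // Samples on the last column/row/slice of the slab ("skirt") still get
    // vertices for the edges they own, but emit no triangle indices; neighbour
    // reads are clamped at the boundary so nothing out of range is touched.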
    void analyzeAVX2(uint8_t* tmp,
                     unsigned& vertexCount_,
                     unsigned& occupiedCells_,
                     unsigned& indexCount_,
                     const float* values,
                     size_t gridLayoutL2_x,
                     size_t gridLayoutL2_y,
                     size_t gridLayout_z,
                     const size_t T_n,
                     const float* thresholds)
    {
        const size_t gridLayout_x = size_t(1) << gridLayoutL2_x;
        const size_t gridLayout_y = size_t(1) << gridLayoutL2_y;
        //const size_t cellLayout_x = gridLayout_x - 1;
        //const size_t cellLayout_y = gridLayout_y - 1;
        const size_t cellLayout_z = gridLayout_z - 1;

        assert((gridLayout_x & 7) == 0 && (gridLayout_y & 7) == 0);
        assert((gridLayout_y*gridLayout_x & 31) == 0);
        assert(gridLayout_x <= 256 && "x must fit inside a byte");

        const auto * axesTable = MarchingCubes::axesTable().data();
        const auto * indexTable = MarchingCubes::indexTable().data();

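        // Scratch layout for this slab, with N = gridLayout_x*gridLayout_y*gridLayout_z
        // samples (matches partScratchSize in createIsoSurfacesSplitAVX2):
        //   [T_n arrays of uint32 vertex offsets, one per sample]
        //   [T_n arrays of uint8 cell codes, one per sample]
        //   [vtxwork: room for 3 uint32 work items per sample per threshold]
        //   [idxwork: room for 15 uint32 work items per sample per threshold]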
        auto * vtxwork = (uint32_t*)(tmp + gridLayout_x * gridLayout_y*gridLayout_z*((sizeof(uint32_t) + sizeof(uint8_t))*T_n));
        auto * idxwork = (uint32_t*)(tmp + gridLayout_x * gridLayout_y*gridLayout_z*((sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t))*T_n));

        unsigned vertexCount = 0;
        unsigned occupiedCells = 0;
        unsigned indexCount = 0;
        for (unsigned l = 0; l < T_n; l++) {

            auto * offsets = (uint32_t*)(tmp + gridLayout_x * gridLayout_y*gridLayout_z * sizeof(uint32_t)*l);
            auto * codes = (tmp + gridLayout_x * gridLayout_y*gridLayout_z*(sizeof(uint32_t)*T_n + l));

            auto T = _mm256_set1_ps(thresholds[l]);
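            // Classify every sample against threshold l: each byte of `codes`
            // becomes 1 if the sample value lies below the threshold, else 0.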
            {
                auto M = _mm256_set1_epi32(1);
                auto * c = codes;
                for (unsigned k = 0; k < gridLayout_z*gridLayout_y*gridLayout_x; k += 8) {
                    auto t0 = _mm256_load_ps(values + k);
                    auto t1 = _mm256_and_si256(M, _mm256_castps_si256(_mm256_cmp_ps(t0, T, _CMP_LT_OQ)));
                    auto t2 = _mm256_extracti128_si256(t1, 1);
                    auto t3 = _mm256_extracti128_si256(t1, 0);
                    auto t4 = _mm_packus_epi32(t3, t2);
                    auto t5 = _mm_packus_epi16(t4, _mm_setzero_si128());
                    _mm_storel_epi64((__m128i*)c, t5);
                    c += 8;
                }
            }
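            // The merge below leaves bit 0 of each byte as the in/out state of the
            // sample itself and bit 4 as the state of the sample one slice up in z;
            // the top slice has no neighbour above and duplicates its own bit.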
            { // merge across slices
                auto * c = (__m256i*)codes;
                const auto M = (gridLayout_y*gridLayout_x) / sizeof(__m256i);
                for (unsigned k = 0; k < cellLayout_z; k++) {
                    for (unsigned j = 0; j < M; j++) {
                        auto t0 = _mm256_load_si256((__m256i*)c);
                        auto t1 = _mm256_load_si256((__m256i*)(c + M));
                        auto t2 = _mm256_slli_epi32(t1, 4);
                        auto t3 = _mm256_or_si256(t0, t2);
                        _mm256_store_si256(c++, t3);
                    }
                }
                for (unsigned j = 0; j < M; j++) {
                    auto t0 = _mm256_load_si256((__m256i*)c);
                    auto t2 = _mm256_slli_epi32(t0, 4);
                    auto t3 = _mm256_or_si256(t0, t2);
                    _mm256_store_si256(c++, t3);
                }
            }
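            // Main sweep, 32 samples at a time: OR in the +x, +y and +x+y neighbour
            // states to complete each cell's 8-bit corner code, clear cells that are
            // entirely inside (0xFF) or outside (0) the surface, and emit vertex and
            // index work items for the remaining intersected cells.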
            {
                auto mask_i_epi8 = _mm256_set1_epi8(static_cast<char>(gridLayout_x - 1));
                auto mask_j_epi8 = _mm256_set1_epi8(static_cast<char>(gridLayout_y - 1));
                auto stride = _mm256_set1_epi8(32);
                auto ki = _mm256_set_epi8(31, 30, 29, 28,
                                          27, 26, 25, 24,
                                          23, 22, 21, 20,
                                          19, 18, 17, 16,
                                          15, 14, 13, 12,
                                          11, 10, 9, 8,
                                          7, 6, 5, 4,
                                          3, 2, 1, 0);
                auto kj = _mm256_set_epi8(31 >> gridLayoutL2_x, 30 >> gridLayoutL2_x, 29 >> gridLayoutL2_x, 28 >> gridLayoutL2_x,
                                          27 >> gridLayoutL2_x, 26 >> gridLayoutL2_x, 25 >> gridLayoutL2_x, 24 >> gridLayoutL2_x,
                                          23 >> gridLayoutL2_x, 22 >> gridLayoutL2_x, 21 >> gridLayoutL2_x, 20 >> gridLayoutL2_x,
                                          19 >> gridLayoutL2_x, 18 >> gridLayoutL2_x, 17 >> gridLayoutL2_x, 16 >> gridLayoutL2_x,
                                          15 >> gridLayoutL2_x, 14 >> gridLayoutL2_x, 13 >> gridLayoutL2_x, 12 >> gridLayoutL2_x,
                                          11 >> gridLayoutL2_x, 10 >> gridLayoutL2_x, 9 >> gridLayoutL2_x, 8 >> gridLayoutL2_x,
                                          7 >> gridLayoutL2_x, 6 >> gridLayoutL2_x, 5 >> gridLayoutL2_x, 4 >> gridLayoutL2_x,
                                          3 >> gridLayoutL2_x, 2 >> gridLayoutL2_x, 1 >> gridLayoutL2_x, 0 >> gridLayoutL2_x);
                auto ones = _mm256_set1_epi32(~0);

                //const auto mask_i = (1 << gridLayoutL2_x) - 1;
                //const auto mask_j = ((1 << gridLayoutL2_y) - 1) << gridLayoutL2_x;
                //const auto mask_k = ((1 << gridLayoutL2.z) - 1) << (gridLayoutL2_x + gridLayoutL2_y);

                for (unsigned i = 0; i < gridLayout_z*gridLayout_y*gridLayout_x; i += 32) {
                    auto skirt_i = _mm256_cmpeq_epi8(_mm256_and_si256(ki, mask_i_epi8), mask_i_epi8);
                    ki = _mm256_add_epi8(ki, stride);

                    // corner 0,0,0 and corner 1,0,0
                    auto t0 = _mm256_load_si256((__m256i*)(codes + i));
                    auto t1 = _mm256_loadu_si256((__m256i*)(codes + i + 1));
                    auto t2 = _mm256_blendv_epi8(t1, t0, skirt_i);
                    auto t3 = _mm256_or_si256(t0, _mm256_slli_epi32(t2, 1));

                    auto row = _mm256_add_epi8(_mm256_set1_epi8(static_cast<char>(i >> gridLayoutL2_x)), kj);
                    auto skirt_j = _mm256_cmpeq_epi8(_mm256_and_si256(row, mask_j_epi8), mask_j_epi8);

                    // corner 0,1,0 and corner 1,1,0
                    auto r0 = _mm256_load_si256((__m256i*)(codes + gridLayout_x + i));
                    auto r1 = _mm256_loadu_si256((__m256i*)(codes + gridLayout_x + i + 1));
                    auto r2 = _mm256_blendv_epi8(r1, r0, skirt_i);
                    auto r3 = _mm256_or_si256(r0, _mm256_slli_epi32(r2, 1));
                    auto r4 = _mm256_blendv_epi8(r3, t3, skirt_j);

                    auto t5 = _mm256_or_si256(t3, _mm256_slli_epi32(r4, 2)); // merge

                    auto ones_ = _mm256_cmpeq_epi8(_mm256_setzero_si256(), _mm256_setzero_si256());
                    auto m2 = _mm256_cmpeq_epi8(t5, ones_);
                    auto code_epi8 = _mm256_andnot_si256(m2, t5);
                    _mm256_store_si256((__m256i*)(codes + i), code_epi8);

                    auto m3 = _mm256_cmpeq_epi8(code_epi8, _mm256_setzero_si256());
                    uint32_t zerolanes = _mm256_movemask_epi8(m3);
                    if (zerolanes == 0xffffffff) continue;

                    auto skirt_k = _mm256_set1_epi8((i >> (gridLayoutL2_x + gridLayoutL2_y)) == (gridLayout_z - 1) ? 255 : 0);
                    auto skirt = _mm256_or_si256(_mm256_or_si256(skirt_i, skirt_j), skirt_k);
                    uint32_t skirtbits = _mm256_movemask_epi8(skirt);

                    for (unsigned r = 0; r < 32; r++) {
                        if (((zerolanes >> r) & 1) == 0) {
                            auto code = code_epi8.m256i_u8[r];
                            offsets[i + r] = vertexCount_ + vertexCount;

                            auto axes = axesTable[code];
                            if (axes & 4) {
                                vtxwork[vertexCount++] = ((i + r) << 8) | (4 << 5) | l;
                            }
                            if (axes & 2) {
                                vtxwork[vertexCount++] = ((i + r) << 8) | (2 << 5) | l;
                            }
                            if (axes & 1) {
                                vtxwork[vertexCount++] = ((i + r) << 8) | (1 << 5) | l;
                            }

                            if (((skirtbits >> r) & 1) == 0) {
                                occupiedCells++;
                                for (auto * axisShifts = indexTable + 16 * code; *axisShifts != 255; axisShifts++) {
                                    idxwork[indexCount++] = ((i + r) << 8) | (*axisShifts << 2) | l;
                                }
                            }
                        }
                    }
                }
            }
        }

        // Pad both work queues up to a multiple of 8 entries by repeating the last
        // item, so the SIMD extraction passes can always run full-width batches.
        if (auto t = vertexCount; t) {
            while ((t & 7)) vtxwork[t++] = vtxwork[t - 1];
        }
        if (auto t = indexCount; t) {
            while (t & 7) idxwork[t++] = idxwork[indexCount - 1];
        }

        vertexCount_ += vertexCount;
        occupiedCells_ += occupiedCells;
        indexCount_ += indexCount;
    }
//#pragma optimize( "", on )


//#pragma optimize( "", off )
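    // vertexExtractAVX2: consumes the vtxwork queue eight entries at a time. Each
    // entry is decoded into a cell index, an edge axis and a threshold index; the
    // two edge endpoint values are gathered, the crossing parameter
    // t = (T - a) / (b - a) is evaluated with a reciprocal approximation, and the
    // position, the negated (unnormalised) forward-difference gradient used as the
    // normal, and a texture coordinate u = (l + 0.5) / T_n are formed. Full batches
    // are written with an 8x8 register transpose into the interleaved vertex
    // layout; the final partial batch falls back to scalar stores.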
    size_t vertexExtractAVX2(Vertex* vertices,
                             uint8_t* tmp,
                             const float* values,
                             size_t gridLayoutL2_x,
                             size_t gridLayoutL2_y,
                             size_t gridLayout_z,
                             size_t T_n,
                             const float* thresholds,
                             size_t vn)
    {
        const size_t gridLayout_x = size_t(1) << gridLayoutL2_x;
        const size_t gridLayout_y = size_t(1) << gridLayoutL2_y;
        //const size_t cellLayout_x = gridLayout_x - 1;
        //const size_t cellLayout_y = gridLayout_y - 1;
        const size_t cellLayout_z = gridLayout_z - 1;
        const auto * work = (uint32_t*)(tmp + gridLayout_x * gridLayout_y*gridLayout_z*((sizeof(uint32_t) + sizeof(uint8_t))*T_n));

        //const auto mask_i = _mm256_set1_epi32(gridLayout.x - 1);
        const auto mask_j = _mm256_set1_epi32(static_cast<int>(gridLayout_y - 1));
        //const auto mask_k = _mm256_set1_epi32(gridLayout_z - 1);
        const auto cellLayout_z_epi32 = _mm256_set1_epi32(static_cast<int>(cellLayout_z));

        const auto X_L2_epi64 = _mm_insert_epi64(_mm_undefined_si128(), gridLayoutL2_x, 0);
        const auto XY_L2_epi64 = _mm_insert_epi64(_mm_undefined_si128(), gridLayoutL2_x + gridLayoutL2_y, 0);
        const auto oneOverTn = _mm256_set1_ps(1.f / T_n);

        const auto one = _mm256_set1_epi32(1);
        const auto X = _mm256_set1_epi32(static_cast<int>(gridLayout_x));
        const auto XY = _mm256_set1_epi32(static_cast<int>(gridLayout_x*gridLayout_y));
        // Note: C3 reads thresholds[0..3] unconditionally, and the permutevar lookup
        // below only uses the low two bits of the threshold index, so this path
        // assumes at most four thresholds.
        const auto C3 = _mm256_setr_ps(thresholds[0], thresholds[1], thresholds[2], thresholds[3],
                                       thresholds[0], thresholds[1], thresholds[2], thresholds[3]);
        for (size_t v = 0; v < vn; v += 8) {
            auto w = _mm256_loadu_si256((__m256i*)(work + v));
            auto o0 = _mm256_srli_epi32(w, 8);
            auto a = _mm256_i32gather_ps(values, o0, 4);

            // Extract i,j,k of sample point 0
            auto mask_i = _mm256_sub_epi32(X, one);
            auto i = _mm256_and_si256(o0, mask_i);
            auto j = _mm256_and_si256(_mm256_srl_epi32(o0, X_L2_epi64), mask_j);
            auto k = _mm256_srl_epi32(o0, XY_L2_epi64);

            // mask set if i < gridLayout.x-1 => i+1 is safe access.
            auto i0_less = _mm256_cmpgt_epi32(mask_i, i);
            auto j0_less = _mm256_cmpgt_epi32(mask_j, j);
            auto k0_less = _mm256_cmpgt_epi32(cellLayout_z_epi32, k);

            // Gather and calc discrete differences at first point
            auto ax = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(i0_less, one)), 4), a);
            auto ay = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(j0_less, X)), 4), a);
            auto az = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(k0_less, XY)), 4), a);

            auto di = _mm256_and_si256(_mm256_srli_epi32(w, 5), one); // 1 if intersection is on x-axis
            auto dj = _mm256_and_si256(_mm256_srli_epi32(w, 6), one);
            auto dk = _mm256_and_si256(_mm256_srli_epi32(w, 7), one);

            // All-set mask if intersection is on x-axis.
            auto mx = _mm256_cmpeq_epi32(di, one);
            auto my = _mm256_cmpeq_epi32(dj, one);
            auto mz = _mm256_cmpeq_epi32(dk, one);

            // Form index of sample point 1
            auto o1 = _mm256_add_epi32(_mm256_add_epi32(o0, di),
                                       _mm256_add_epi32(_mm256_and_si256(my, X),
                                                        _mm256_and_si256(mz, XY)));
            auto b = _mm256_i32gather_ps(values, o1, 4);

            // mask set if k + dk < gridLayout.z-1 => dk+1 is safe access.
            auto i1_less = _mm256_cmpgt_epi32(mask_i, _mm256_add_epi32(i, di));
            auto j1_less = _mm256_cmpgt_epi32(mask_j, _mm256_add_epi32(j, dj));
            auto k1_less = _mm256_cmpgt_epi32(cellLayout_z_epi32, _mm256_add_epi32(k, dk));

            // Gather and calc discrete differences at second point
            auto bx = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(i1_less, one)), 4), b);
            auto by = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(j1_less, X)), 4), b);
            auto bz = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(k1_less, XY)), 4), b);

            auto l = _mm256_and_si256(w, _mm256_set1_epi32((1 << 5) - 1));
            auto T = _mm256_permutevar_ps(C3, l);
            auto t = _mm256_mul_ps(_mm256_sub_ps(T, a), _mm256_rcp_ps(_mm256_sub_ps(b, a)));

            // Interpolate position
            auto px = _mm256_add_ps(_mm256_cvtepi32_ps(i), _mm256_and_ps(_mm256_castsi256_ps(mx), t));
            auto py = _mm256_add_ps(_mm256_cvtepi32_ps(j), _mm256_and_ps(_mm256_castsi256_ps(my), t));
            auto pz = _mm256_add_ps(_mm256_cvtepi32_ps(k), _mm256_and_ps(_mm256_castsi256_ps(mz), t));

            // Form negative of interpolated gradient (-a + t(a-b))
            auto gx = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(ax, bx)), ax);
            auto gy = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(ay, by)), ay);
            auto gz = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(az, bz)), az);

            auto half = _mm256_set1_ps(0.5f);
            auto tp = _mm256_mul_ps(oneOverTn, _mm256_add_ps(_mm256_cvtepi32_ps(l), half));

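            // Fast path: transpose eight SoA lanes (px, py, pz, gx, gy, gz, tp, 0.5)
            // into eight interleaved 8-float Vertex records using unpack/shuffle/blend
            // plus cross-lane permutes, then write them with aligned stores. The
            // commented _mm256_setr_ps lines below seed recognisable lane values for
            // debugging the shuffle network.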
            if (v + 8 <= vn) {
                //px = _mm256_setr_ps( 0,  8, 16, 24, 32, 40, 48, 55);
                //py = _mm256_setr_ps( 1,  9, 17, 25, 33, 41, 49, 56);
                //pz = _mm256_setr_ps( 2, 10, 18, 26, 34, 42, 50, 57);
                //gx = _mm256_setr_ps( 3, 11, 19, 27, 35, 43, 51, 58);
                //gy = _mm256_setr_ps( 4, 12, 20, 28, 36, 44, 52, 59);
                //gz = _mm256_setr_ps( 5, 13, 21, 29, 37, 45, 53, 60);
                //tp = _mm256_setr_ps( 6, 14, 22, 30, 38, 46, 54, 61);

                auto a0 = _mm256_unpacklo_ps(px, py);   // 0 1 8 9   | 32 33 40 41
                auto a2 = _mm256_unpacklo_ps(pz, gx);   // 2 3 10 11 | 34 35 42 43
                auto a4 = _mm256_unpacklo_ps(gy, gz);   // 4 5 12 13 | 36 37 44 45
                auto a6 = _mm256_unpacklo_ps(tp, half); // 6 h 14 h  | 38 h 46 h

                auto b0_ = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(1, 0, 3, 2)); // 8 9 2 3 | 40 41 34 35
                auto b0 = _mm256_blend_ps(a0, b0_, 0xCC); // = 11001100        // 0 1 2 3 | 32 33 34 35
                auto b1 = _mm256_blend_ps(a2, b0_, 0x33); // = 00110011        // 8 9 10 11 | 40 41 42 43

                //auto b0 = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(1, 0, 1, 0)); // 0 1 2 3 | 32 33 34 35
                //auto b1 = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(3, 2, 3, 2)); // 8 9 10 11 | 40 41 42 43

                auto b4_ = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(1, 0, 3, 2));
                auto b4 = _mm256_blend_ps(a4, b4_, 0xCC);
                auto b5 = _mm256_blend_ps(a6, b4_, 0x33);
                //auto b4 = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(1, 0, 1, 0)); // 4 5 6 h | 36 37 38 h
                //auto b5 = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(3, 2, 3, 2)); // 12 13 14 h | 44 45 46 h

                _mm256_store_ps((float*)vertices + 8 * (v + 0), _mm256_permute2f128_ps(b0, b4, 0x20));
                _mm256_store_ps((float*)vertices + 8 * (v + 4), _mm256_permute2f128_ps(b0, b4, 0x31));

                auto a1 = _mm256_unpackhi_ps(px, py); // 16 17 24 25 | 48 49 55 56
                auto a3 = _mm256_unpackhi_ps(pz, gx); // 18 19 26 27 | 50 51 57 58

                auto b2_ = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(1, 0, 3, 2));
                auto b2 = _mm256_blend_ps(a1, b2_, 0xCC);
                auto b3 = _mm256_blend_ps(a3, b2_, 0x33);
                //auto b2 = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(1, 0, 1, 0)); // 16 17 18 19 | 48 49 50 51
                //auto b3 = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(3, 2, 3, 2)); // 24 25 26 27 | 55 56 57 58

                auto a5 = _mm256_unpackhi_ps(gy, gz);   // 20 21 28 29 | 52 53 59 60
                auto a7 = _mm256_unpackhi_ps(tp, half); // 22 h 30 h   | 54 h 61 h

                auto b6_ = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(1, 0, 3, 2));
                auto b6 = _mm256_blend_ps(a5, b6_, 0xCC);
                auto b7 = _mm256_blend_ps(a7, b6_, 0x33);
                //auto b6 = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(1, 0, 1, 0)); // 20 21 22 h | 52 53 54 h
                //auto b7 = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(3, 2, 3, 2)); // 28 29 30 h | 59 60 61 h


                _mm256_store_ps((float*)vertices + 8 * (v + 1), _mm256_permute2f128_ps(b1, b5, 0x20));
                _mm256_store_ps((float*)vertices + 8 * (v + 2), _mm256_permute2f128_ps(b2, b6, 0x20));
                _mm256_store_ps((float*)vertices + 8 * (v + 3), _mm256_permute2f128_ps(b3, b7, 0x20));
                _mm256_store_ps((float*)vertices + 8 * (v + 5), _mm256_permute2f128_ps(b1, b5, 0x31));
                _mm256_store_ps((float*)vertices + 8 * (v + 6), _mm256_permute2f128_ps(b2, b6, 0x31));
                _mm256_store_ps((float*)vertices + 8 * (v + 7), _mm256_permute2f128_ps(b3, b7, 0x31));
            }
            else {
                for (size_t ll = 0; ll < 8 && v + ll < vn; ll++) {
                    vertices[v + ll].pos.x = px.m256_f32[ll];
                    vertices[v + ll].pos.y = py.m256_f32[ll];
                    vertices[v + ll].pos.z = pz.m256_f32[ll];
                    vertices[v + ll].nrm.x = gx.m256_f32[ll];
                    vertices[v + ll].nrm.y = gy.m256_f32[ll];
                    vertices[v + ll].nrm.z = gz.m256_f32[ll];
                    vertices[v + ll].tex.x = tp.m256_f32[ll];
                    vertices[v + ll].tex.y = 0.5f;
                }
            }

        }
        //assert(vix == vo);
        return vn;
    }
//#pragma optimize( "", on )

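    // indexExtract: scalar index generation over the whole grid. For every
    // intersected cell it walks the marching-cubes index table, locates the cell
    // that owns each referenced edge via the axis-shift bits, and forms the vertex
    // index as that cell's vertex base plus the number of edge vertices stored
    // before the wanted one (z-, then y-, then x-edge order). sub_mesh[l+1]
    // receives the running index count after threshold l, so consecutive entries
    // delimit each threshold's index range.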
    unsigned indexExtract(uint32_t* sub_mesh,
                          Idx* indices,
                          uint8_t* tmp,
                          const size3_t gridLayout,
                          const size_t T_n,
                          const size_t /*indexCount*/,
                          const size_t vertexCount)
    {
        const size3_t cellLayout = { gridLayout.x - 1, gridLayout.y - 1, gridLayout.z - 1 };
        const auto * axesTable = MarchingCubes::axesTable().data();
        const auto * indexTable = MarchingCubes::indexTable().data();

        unsigned iix = 0;
        for (unsigned l = 0; l < T_n; l++) {
            auto * offsets = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z * sizeof(uint32_t)*l);
            auto * codes = tmp + gridLayout.x*gridLayout.y*gridLayout.z*(sizeof(uint32_t)*T_n + l);
            auto * c = codes;
            for (unsigned k = 0; k < cellLayout.z; k++) {
                for (unsigned j = 0; j < cellLayout.y; j++) {
                    for (unsigned i = 0; i < cellLayout.x; i++) {
                        auto code = *c++;
                        if (code == 0) continue;

                        auto * axisShifts = indexTable + 16 * code;
                        while (true)
                        {
                            auto axisShift = *axisShifts++;
                            if (axisShift == 255) break;

                            auto ii = i + (axisShift & 8 ? 1 : 0);
                            auto jj = j + (axisShift & 16 ? 1 : 0);
                            auto kk = k + (axisShift & 32 ? 1 : 0);

                            auto shiftedCell = (kk*gridLayout.y + jj)*gridLayout.x + ii;

                            const auto shiftedCode = codes[shiftedCell];
                            const auto axes = axesTable[shiftedCode] & axisShift;

                            const auto ix = offsets[shiftedCell] + ((axes & 4) ? 1 : 0) + ((axes & 2) ? 1 : 0);
                            assert(ix < vertexCount);

                            assert(sizeof(Idx) != 2 || ix < 0xffff);
                            indices[iix++] = (Idx)ix;
                        }
                    }

                    c++;
                }
                c += gridLayout.x;
            }
            sub_mesh[l + 1] = iix;
        }

        return iix;
    }

//#pragma optimize( "", off)
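    // indexExtractAVX2: vectorised variant of indexExtract that consumes the
    // idxwork queue eight entries at a time, resolves each entry to the owning
    // cell's vertex offset in the same way as the scalar path, and writes 32-bit
    // indices. It is only referenced from the per-part index path that is
    // currently commented out in createIsoSurfacesSplitAVX2 below.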
    unsigned indexExtractAVX2(uint32_t* indices,
                              uint8_t* tmp,
                              unsigned gridLayoutL2_x,
                              unsigned gridLayoutL2_y,
                              unsigned gridLayout_z,
                              const unsigned T_n,
                              const unsigned indexCount,
                              const unsigned maxVertex)
    {
        const unsigned gridLayout_x = 1 << gridLayoutL2_x;
        const unsigned gridLayout_y = 1 << gridLayoutL2_y;
        //const unsigned cellLayout_x = gridLayout_x - 1;
        //const unsigned cellLayout_y = gridLayout_y - 1;
        //const unsigned cellLayout_z = gridLayout_z - 1;

        const auto * axesTable = MarchingCubes::axesTable().data();

        const auto * idxwork = (uint32_t*)(tmp + gridLayout_x*gridLayout_y*gridLayout_z*((sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t))*T_n));
        const auto * codes = tmp + gridLayout_x*gridLayout_y*gridLayout_z*(sizeof(uint32_t)*T_n);
        const auto * offsets = (uint32_t*)tmp;

        const auto one_epi32 = _mm256_set1_epi32(1);
        const auto three_epi32 = _mm256_set1_epi32(3);


        //const auto gridL2_3_epi64 = _mm_set1_epi64x(gridLayoutL2_x + gridLayoutL2_y + gridLayoutL2_z);
        const auto grid_3_epi32 = _mm256_set1_epi32(gridLayout_x*gridLayout_y*gridLayout_z);

        const auto X_epi32 = _mm256_set1_epi32(gridLayout_x);
        const auto XY_epi32 = _mm256_set1_epi32(gridLayout_x*gridLayout_y);
        const auto mask_255_epi32 = _mm256_set1_epi32(255);
        for (unsigned iix = 0; iix < indexCount; iix += 8) {
            const auto w = _mm256_load_si256((const __m256i*)(idxwork + iix));
            const auto o0 = _mm256_srli_epi32(w, 8);

            // (((w >> 5) & 1), (((w >> 6) & 1) ? gridLayout.x : 0), (((w >> 7) & 1) ? gridLayout.x*gridLayout.y : 0))
            const auto shiftI = _mm256_and_si256(_mm256_srli_epi32(w, 5), one_epi32);
            const auto shiftJ = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(_mm256_srli_epi32(w, 6), one_epi32), one_epi32), X_epi32);
            const auto shiftK = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(_mm256_srli_epi32(w, 7), one_epi32), one_epi32), XY_epi32);

            const auto shiftedCell = _mm256_add_epi32(_mm256_add_epi32(o0, shiftI),
                                                      _mm256_add_epi32(shiftJ, shiftK));

            //const auto L = _mm256_sll_epi32(_mm256_and_si256(w, three_epi32), gridL2_3_epi64); // l << (gridLayoutL2.x + gridLayoutL2.y + gridLayoutL2.z)

            const auto L = _mm256_mullo_epi32(grid_3_epi32, _mm256_and_si256(w, three_epi32));

            const auto I = _mm256_add_epi32(shiftedCell, L);

            const auto shiftedCode = _mm256_and_si256(_mm256_i32gather_epi32((const int*)codes, I, 1), mask_255_epi32);
            const auto axisShift = _mm256_srli_epi32(w, 2);
            const auto axes = _mm256_and_si256(_mm256_i32gather_epi32((const int*)axesTable, shiftedCode, 1), axisShift);

            const auto addJ = _mm256_and_si256(_mm256_srli_epi32(axes, 1), one_epi32);
            const auto addK = _mm256_and_si256(_mm256_srli_epi32(axes, 2), one_epi32);
            const auto offset = _mm256_add_epi32(_mm256_i32gather_epi32((const int*)offsets, I, 4),
                                                 _mm256_add_epi32(addJ, addK));

            for (unsigned i = 0; i < 8; i++) {
                assert(offset.m256i_u32[i] < maxVertex);
            }
            _mm256_storeu_si256((__m256i*)(indices + iix), offset);
        }

        return indexCount;
    }
//#pragma optimize( "", on )


}

namespace Cogs::Core::EchoSounder {


//#pragma optimize( "", off )
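    // createIsoSurfacesSplitAVX2: builds a single mesh holding the iso-surfaces of
    // all thresholds. The grid is split into `parts` z-slabs that overlap by one
    // sample slice; each slab is analysed into its own scratch region and its
    // vertices are extracted with the AVX2 path. Indices are then generated by the
    // scalar indexExtract pass and grouped into one sub-mesh per threshold, and the
    // bounds are taken from the generated vertices (in grid coordinates). The
    // analyze/vtx/idx out-parameters receive the elapsed time of each pass in
    // microseconds.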
    MeshHandle createIsoSurfacesSplitAVX2(Context* context,
                                          uint64_t& analyze,
                                          uint64_t& vtx,
                                          uint64_t& idx,
                                          MemoryBuffer& scratch, // scratch storage reused across calls (type assumed from Foundation/Memory/MemoryBuffer.h)
                                          const float* values,
                                          const glm::uvec3 gridLayout_,
                                          const glm::vec3 /*minCorner*/,
                                          const glm::vec3 /*maxCorner*/,
                                          const float *thresholds, size_t count)
    {
        size3_t gridLayout = { gridLayout_.x, gridLayout_.y, gridLayout_.z };
        size3_t gridLayoutL2 = {
            static_cast<size_t>(log2(gridLayout_.x)),
            static_cast<size_t>(log2(gridLayout_.y)),
            static_cast<size_t>(log2(gridLayout_.z))
        };
        assert(gridLayout.x != 0 && gridLayout.y != 0 && gridLayout.z != 0);
        assert((size_t(1) << gridLayoutL2.x) == gridLayout.x &&
               (size_t(1) << gridLayoutL2.y) == gridLayout.y &&
               (size_t(1) << gridLayoutL2.z) == gridLayout.z);

        const size_t T_n = count;

        // !!!!! This is where you set number of parts !!!!!
        const size_t parts = 2;
        assert((parts < gridLayout.z - 1) && "Parts must be at least a slice thick");

        const size_t partSpacing = (gridLayout.z - 2) / parts;
        const size_t partSize = partSpacing + 1;
        assert(1 <= partSpacing);
        assert(parts*partSpacing + 1 < gridLayout.z);
        assert(gridLayout.z <= (parts + 1)*partSpacing);

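        // Per sample and per threshold the scratch holds 4 bytes of vertex offsets,
        // 1 byte of cell codes, room for 3 vertex work items (12 bytes) and room for
        // 15 index work items (60 bytes, i.e. at most five triangles per cell).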
        size_t partScratchSize = (sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t) + 15 * sizeof(uint32_t))*T_n*gridLayout.x*gridLayout.y*partSize;
        scratch.resize(parts * partScratchSize, false);
        auto * tmp = (uint8_t*)scratch.data();
        //std::memset(tmp, -1, scratch.size());

        unsigned occupiedCells = 0;
        unsigned indexCount = 0;
        unsigned vertexCount = 0;
        std::vector<unsigned> partVertexOffset(parts + 1);
        std::vector<unsigned> partIndexOffset(parts + 1);

        // Analyze pass
        auto timer = Timer::startNew();
        for (size_t p = 0; p < parts; p++) {
            size_t slice_a = partSpacing * p;
            size_t slice_b = std::min(slice_a + partSize, gridLayout.z);

            partVertexOffset[p] = vertexCount;
            partIndexOffset[p] = indexCount;
            analyzeAVX2(tmp + partScratchSize * p, vertexCount, occupiedCells, indexCount,
                        values + slice_a * gridLayout.x*gridLayout.y,
                        gridLayoutL2.x, gridLayoutL2.y, slice_b - slice_a,
                        T_n, thresholds);
        }
        partVertexOffset[parts] = vertexCount;
        partIndexOffset[parts] = indexCount;
        analyze = timer.elapsedMicroseconds();

        if (occupiedCells == 0) {
            vtx = 0;
            idx = 0;
            return MeshHandle::NoHandle; // nothing intersects any iso-surface
        }

        auto mesh = context->meshManager->createLocked();
        //mesh->setBounds(Cogs::Geometry::BoundingBox{ glm::vec3(0,0,0), glm::vec3(gridLayout.x, gridLayout.y, gridLayout.z) });
        mesh->setMeshFlag(MeshFlags::ClockwiseWinding);
        mesh->primitiveType = PrimitiveType::TriangleList;
        auto vertices = mesh->map<Vertex>(VertexDataType::Interleaved0, VertexFormats::Pos3fNorm3fTex2f, vertexCount);

        // Extract vertices
        timer = Timer::startNew();
        for (size_t p = 0; p < parts; p++) {
            size_t vn = partVertexOffset[p + 1] - partVertexOffset[p];
            if (vn == 0) continue;

            size_t slice_a = partSpacing * p;
            size_t slice_b = std::min(slice_a + partSize, gridLayout.z);
            vertexExtractAVX2(vertices.data + partVertexOffset[p], // assumed to be 32b aligned since we use aligned stores.
                              tmp + partScratchSize * p,
                              values + slice_a * gridLayout.x*gridLayout.y,
                              gridLayoutL2.x, gridLayoutL2.y, slice_b - slice_a,
                              T_n, thresholds, vn);
        }
        vtx = timer.elapsedMicroseconds();

        // extract indices
        //std::vector<uint32_t> indices(indexCount + 8);
        //timer = Timer::startNew();
        //for (unsigned p = 0; p < parts; p++) {
        //    auto ixn = partIndexOffset[p + 1] - partIndexOffset[p];
        //    if (ixn == 0) continue;
        //    unsigned slice_a = partSpacing * p;
        //    unsigned slice_b = std::min(slice_a + partSize, gridLayout.z);
        //    indexExtractAVX2(indices.data() + partIndexOffset[p],
        //                     tmp + partScratchSize * p,
        //                     gridLayoutL2.x, gridLayoutL2.y, slice_b - slice_a,
        //                     T_n, ixn, partVertexOffset[p + 1]);
        //}
        //idx = timer.elapsedMicroseconds();

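        // Active index path: the scalar indexExtract writes 16-bit indices (Idx)
        // directly into the mapped index stream and records the per-threshold index
        // ranges in sub_mesh, so the total vertex count must stay below 0xffff
        // (asserted inside indexExtract).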
        std::vector<uint32_t> sub_mesh(T_n + 1, 0);
        mesh->clearIndexes();
        Idx *indices = (Idx*)mesh->mapStream(VertexDataType::Indexes, 0, indexCount, sizeof(Idx), true);
        timer = Timer::startNew();
        auto iix = indexExtract(sub_mesh.data(), indices, tmp, gridLayout, T_n, indexCount, vertexCount);
        assert(iix == indexCount);
        mesh->setMeshFlag(MeshFlags::Indexed);
        mesh->setMeshFlag(MeshFlags::IndexesChanged);
        idx = timer.elapsedMicroseconds();

        auto subMeshes = mesh->mapSubMeshes(static_cast<uint32_t>(T_n));
        for (size_t l = 0; l < T_n; l++) {
            uint32_t start = sub_mesh[l];
            uint32_t size = sub_mesh[l + 1] - sub_mesh[l];
            subMeshes[l] = { start, size, PrimitiveType::TriangleList };
        }
        mesh->setCount(indexCount);

        auto box = Geometry::makeEmptyBoundingBox<Geometry::BoundingBox>();
        for (unsigned i = 0; i < vertexCount; i++) {
            glm::vec3 pos = vertices[i].pos;
            box.min = glm::min(box.min, pos);
            box.max = glm::max(box.max, pos);
        }
        mesh->setBounds(box);

        _mm256_zeroupper();
        return mesh.getHandle();
    }
//#pragma optimize( "", on )

}