Cogs.Core
UniformGridSystem_isosurf_split_avx2.cpp
#include "Resources/Resources.h"
#include "Resources/MeshManager.h"
#include "Resources/VertexFormats.h"

#include "Context.h"

#include "../../../IsoSurfaces/MarchingCubesTables.h"

#include "Foundation/Logging/Logger.h"
#include "Foundation/Memory/MemoryBuffer.h"
#include "Foundation/Platform/Timer.h"

#include <glm/glm.hpp>

#include <immintrin.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

namespace {
    using namespace Cogs::Core;

    Cogs::Logging::Log logger = Cogs::Logging::getLogger("UniformGridSystem");

    struct Vertex
    {
        glm::vec3 pos;
        glm::vec3 nrm;
        glm::vec2 tex;
    };
    typedef uint16_t Idx;
    struct size3_t { size_t x, y, z; };

//#pragma optimize( "", off )
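    // analyzeAVX2: classification pass over one z-slab of the grid. For each
    // threshold it builds an 8-bit marching-cubes corner code per sample, stores
    // each intersected cell's first vertex index in the per-threshold offsets
    // array (vertexCount_ carries the running total across slabs), and appends
    // compact work items for the extraction passes:
    //   vtxwork entry: (cellIndex << 8) | (axisBit << 5) | thresholdIndex
    //   idxwork entry: (cellIndex << 8) | (axisShift << 2) | thresholdIndex
    // Samples on the last column/row/slice of the slab ("skirt") still get
    // vertices for the edges they own, but emit no triangle indices; neighbour
    // reads are clamped at the boundary so nothing out of range is touched.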
    void analyzeAVX2(uint8_t* tmp,
                     unsigned& vertexCount_,
                     unsigned& occupiedCells_,
                     unsigned& indexCount_,
                     const float* values,
                     size_t gridLayoutL2_x,
                     size_t gridLayoutL2_y,
                     size_t gridLayout_z,
                     const size_t T_n,
                     const float* thresholds)
    {
        const size_t gridLayout_x = size_t(1) << gridLayoutL2_x;
        const size_t gridLayout_y = size_t(1) << gridLayoutL2_y;
        //const size_t cellLayout_x = gridLayout_x - 1;
        //const size_t cellLayout_y = gridLayout_y - 1;
        const size_t cellLayout_z = gridLayout_z - 1;

        assert((gridLayout_x & 7) == 0 && (gridLayout_y & 7) == 0);
        assert((gridLayout_y*gridLayout_x & 31) == 0);
        assert(gridLayout_x <= 256 && "x must fit inside a byte");

        const auto * axesTable = MarchingCubes::axesTable().data();
        const auto * indexTable = MarchingCubes::indexTable().data();

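        // Scratch layout for this slab, with N = gridLayout_x*gridLayout_y*gridLayout_z
        // samples (matches partScratchSize in createIsoSurfacesSplitAVX2):
        //   [T_n arrays of uint32 vertex offsets, one per sample]
        //   [T_n arrays of uint8 cell codes, one per sample]
        //   [vtxwork: room for 3 uint32 work items per sample per threshold]
        //   [idxwork: room for 15 uint32 work items per sample per threshold]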
        auto * vtxwork = (uint32_t*)(tmp + gridLayout_x * gridLayout_y*gridLayout_z*((sizeof(uint32_t) + sizeof(uint8_t))*T_n));
        auto * idxwork = (uint32_t*)(tmp + gridLayout_x * gridLayout_y*gridLayout_z*((sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t))*T_n));

        unsigned vertexCount = 0;
        unsigned occupiedCells = 0;
        unsigned indexCount = 0;
        for (unsigned l = 0; l < T_n; l++) {

            auto * offsets = (uint32_t*)(tmp + gridLayout_x * gridLayout_y*gridLayout_z * sizeof(uint32_t)*l);
            auto * codes = (tmp + gridLayout_x * gridLayout_y*gridLayout_z*(sizeof(uint32_t)*T_n + l));

            auto T = _mm256_set1_ps(thresholds[l]);
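            // Classify every sample against threshold l: each byte of `codes`
            // becomes 1 if the sample value lies below the threshold, else 0.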
            {
                auto M = _mm256_set1_epi32(1);
                auto * c = codes;
                for (unsigned k = 0; k < gridLayout_z*gridLayout_y*gridLayout_x; k += 8) {
                    auto t0 = _mm256_load_ps(values + k);
                    auto t1 = _mm256_and_si256(M, _mm256_castps_si256(_mm256_cmp_ps(t0, T, _CMP_LT_OQ)));
                    auto t2 = _mm256_extracti128_si256(t1, 1);
                    auto t3 = _mm256_extracti128_si256(t1, 0);
                    auto t4 = _mm_packus_epi32(t3, t2);
                    auto t5 = _mm_packus_epi16(t4, _mm_setzero_si128());
                    _mm_storel_epi64((__m128i*)c, t5);
                    c += 8;
                }
            }
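            // The merge below leaves bit 0 of each byte as the in/out state of the
            // sample itself and bit 4 as the state of the sample one slice up in z;
            // the top slice has no neighbour above and duplicates its own bit.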
            { // merge across slices
                auto * c = (__m256i*)codes;
                const auto M = (gridLayout_y*gridLayout_x) / sizeof(__m256i);
                for (unsigned k = 0; k < cellLayout_z; k++) {
                    for (unsigned j = 0; j < M; j++) {
                        auto t0 = _mm256_load_si256((__m256i*)c);
                        auto t1 = _mm256_load_si256((__m256i*)(c + M));
                        auto t2 = _mm256_slli_epi32(t1, 4);
                        auto t3 = _mm256_or_si256(t0, t2);
                        _mm256_store_si256(c++, t3);
                    }
                }
                for (unsigned j = 0; j < M; j++) {
                    auto t0 = _mm256_load_si256((__m256i*)c);
                    auto t2 = _mm256_slli_epi32(t0, 4);
                    auto t3 = _mm256_or_si256(t0, t2);
                    _mm256_store_si256(c++, t3);
                }
            }
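            // Main sweep, 32 samples at a time: OR in the +x, +y and +x+y neighbour
            // states to complete each cell's 8-bit corner code, clear cells that are
            // entirely inside (0xFF) or outside (0) the surface, and emit vertex and
            // index work items for the remaining intersected cells.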
            {
                auto mask_i_epi8 = _mm256_set1_epi8(static_cast<char>(gridLayout_x - 1));
                auto mask_j_epi8 = _mm256_set1_epi8(static_cast<char>(gridLayout_y - 1));
                auto stride = _mm256_set1_epi8(32);
                auto ki = _mm256_set_epi8(31, 30, 29, 28,
                                          27, 26, 25, 24,
                                          23, 22, 21, 20,
                                          19, 18, 17, 16,
                                          15, 14, 13, 12,
                                          11, 10, 9, 8,
                                          7, 6, 5, 4,
                                          3, 2, 1, 0);
                auto kj = _mm256_set_epi8(31 >> gridLayoutL2_x, 30 >> gridLayoutL2_x, 29 >> gridLayoutL2_x, 28 >> gridLayoutL2_x,
                                          27 >> gridLayoutL2_x, 26 >> gridLayoutL2_x, 25 >> gridLayoutL2_x, 24 >> gridLayoutL2_x,
                                          23 >> gridLayoutL2_x, 22 >> gridLayoutL2_x, 21 >> gridLayoutL2_x, 20 >> gridLayoutL2_x,
                                          19 >> gridLayoutL2_x, 18 >> gridLayoutL2_x, 17 >> gridLayoutL2_x, 16 >> gridLayoutL2_x,
                                          15 >> gridLayoutL2_x, 14 >> gridLayoutL2_x, 13 >> gridLayoutL2_x, 12 >> gridLayoutL2_x,
                                          11 >> gridLayoutL2_x, 10 >> gridLayoutL2_x, 9 >> gridLayoutL2_x, 8 >> gridLayoutL2_x,
                                          7 >> gridLayoutL2_x, 6 >> gridLayoutL2_x, 5 >> gridLayoutL2_x, 4 >> gridLayoutL2_x,
                                          3 >> gridLayoutL2_x, 2 >> gridLayoutL2_x, 1 >> gridLayoutL2_x, 0 >> gridLayoutL2_x);
                auto ones = _mm256_set1_epi32(~0);

                //const auto mask_i = (1 << gridLayoutL2_x) - 1;
                //const auto mask_j = ((1 << gridLayoutL2_y) - 1) << gridLayoutL2_x;
                //const auto mask_k = ((1 << gridLayoutL2.z) - 1) << (gridLayoutL2_x + gridLayoutL2_y);

                for (unsigned i = 0; i < gridLayout_z*gridLayout_y*gridLayout_x; i += 32) {
                    auto skirt_i = _mm256_cmpeq_epi8(_mm256_and_si256(ki, mask_i_epi8), mask_i_epi8);
                    ki = _mm256_add_epi8(ki, stride);

                    // corner 0,0,0 and corner 1,0,0
                    auto t0 = _mm256_load_si256((__m256i*)(codes + i));
                    auto t1 = _mm256_loadu_si256((__m256i*)(codes + i + 1));
                    auto t2 = _mm256_blendv_epi8(t1, t0, skirt_i);
                    auto t3 = _mm256_or_si256(t0, _mm256_slli_epi32(t2, 1));

                    auto row = _mm256_add_epi8(_mm256_set1_epi8(static_cast<char>(i >> gridLayoutL2_x)), kj);
                    auto skirt_j = _mm256_cmpeq_epi8(_mm256_and_si256(row, mask_j_epi8), mask_j_epi8);

                    // corner 0,1,0 and corner 1,1,0
                    auto r0 = _mm256_load_si256((__m256i*)(codes + gridLayout_x + i));
                    auto r1 = _mm256_loadu_si256((__m256i*)(codes + gridLayout_x + i + 1));
                    auto r2 = _mm256_blendv_epi8(r1, r0, skirt_i);
                    auto r3 = _mm256_or_si256(r0, _mm256_slli_epi32(r2, 1));
                    auto r4 = _mm256_blendv_epi8(r3, t3, skirt_j);

                    auto t5 = _mm256_or_si256(t3, _mm256_slli_epi32(r4, 2)); // merge

                    auto ones_ = _mm256_cmpeq_epi8(_mm256_setzero_si256(), _mm256_setzero_si256());
                    auto m2 = _mm256_cmpeq_epi8(t5, ones_);
                    auto code_epi8 = _mm256_andnot_si256(m2, t5);
                    _mm256_store_si256((__m256i*)(codes + i), code_epi8);

                    auto m3 = _mm256_cmpeq_epi8(code_epi8, _mm256_setzero_si256());
                    uint32_t zerolanes = _mm256_movemask_epi8(m3);
                    if (zerolanes == 0xffffffff) continue;

                    auto skirt_k = _mm256_set1_epi8((i >> (gridLayoutL2_x + gridLayoutL2_y)) == (gridLayout_z - 1) ? 255 : 0);
                    auto skirt = _mm256_or_si256(_mm256_or_si256(skirt_i, skirt_j), skirt_k);
                    uint32_t skirtbits = _mm256_movemask_epi8(skirt);

                    for (unsigned r = 0; r < 32; r++) {
                        if (((zerolanes >> r) & 1) == 0) {
                            auto code = code_epi8.m256i_u8[r];
                            offsets[i + r] = vertexCount_ + vertexCount;

                            auto axes = axesTable[code];
                            if (axes & 4) {
                                vtxwork[vertexCount++] = ((i + r) << 8) | (4 << 5) | l;
                            }
                            if (axes & 2) {
                                vtxwork[vertexCount++] = ((i + r) << 8) | (2 << 5) | l;
                            }
                            if (axes & 1) {
                                vtxwork[vertexCount++] = ((i + r) << 8) | (1 << 5) | l;
                            }

                            if (((skirtbits >> r) & 1) == 0) {
                                occupiedCells++;
                                for (auto * axisShifts = indexTable + 16 * code; *axisShifts != 255; axisShifts++) {
                                    idxwork[indexCount++] = ((i + r) << 8) | (*axisShifts << 2) | l;
                                }
                            }
                        }
                    }
                }
            }
        }

        // Pad both work queues up to a multiple of 8 entries by repeating the last
        // item, so the SIMD extraction passes can always run full-width batches.
        if (auto t = vertexCount; t) {
            while ((t & 7)) vtxwork[t++] = vtxwork[t - 1];
        }
        if (auto t = indexCount; t) {
            while (t & 7) idxwork[t++] = idxwork[indexCount - 1];
        }

        vertexCount_ += vertexCount;
        occupiedCells_ += occupiedCells;
        indexCount_ += indexCount;
    }
//#pragma optimize( "", on )


//#pragma optimize( "", off )
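    // vertexExtractAVX2: consumes the vtxwork queue eight entries at a time. Each
    // entry is decoded into a cell index, an edge axis and a threshold index; the
    // two edge endpoint values are gathered, the crossing parameter
    // t = (T - a) / (b - a) is evaluated with a reciprocal approximation, and the
    // position, the negated (unnormalised) forward-difference gradient used as the
    // normal, and a texture coordinate u = (l + 0.5) / T_n are formed. Full batches
    // are written with an 8x8 register transpose into the interleaved vertex
    // layout; the final partial batch falls back to scalar stores.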
    size_t vertexExtractAVX2(Vertex* vertices,
                             uint8_t* tmp,
                             const float* values,
                             size_t gridLayoutL2_x,
                             size_t gridLayoutL2_y,
                             size_t gridLayout_z,
                             size_t T_n,
                             const float* thresholds,
                             size_t vn)
    {
        const size_t gridLayout_x = size_t(1) << gridLayoutL2_x;
        const size_t gridLayout_y = size_t(1) << gridLayoutL2_y;
        //const size_t cellLayout_x = gridLayout_x - 1;
        //const size_t cellLayout_y = gridLayout_y - 1;
        const size_t cellLayout_z = gridLayout_z - 1;
        const auto * work = (uint32_t*)(tmp + gridLayout_x * gridLayout_y*gridLayout_z*((sizeof(uint32_t) + sizeof(uint8_t))*T_n));

        //const auto mask_i = _mm256_set1_epi32(gridLayout.x - 1);
        const auto mask_j = _mm256_set1_epi32(static_cast<int>(gridLayout_y - 1));
        //const auto mask_k = _mm256_set1_epi32(gridLayout_z - 1);
        const auto cellLayout_z_epi32 = _mm256_set1_epi32(static_cast<int>(cellLayout_z));

        const auto X_L2_epi64 = _mm_insert_epi64(_mm_undefined_si128(), gridLayoutL2_x, 0);
        const auto XY_L2_epi64 = _mm_insert_epi64(_mm_undefined_si128(), gridLayoutL2_x + gridLayoutL2_y, 0);
        const auto oneOverTn = _mm256_set1_ps(1.f / T_n);

        const auto one = _mm256_set1_epi32(1);
        const auto X = _mm256_set1_epi32(static_cast<int>(gridLayout_x));
        const auto XY = _mm256_set1_epi32(static_cast<int>(gridLayout_x*gridLayout_y));
        // Note: C3 reads thresholds[0..3] unconditionally, and the permutevar lookup
        // below only uses the low two bits of the threshold index, so this path
        // assumes at most four thresholds.
        const auto C3 = _mm256_setr_ps(thresholds[0], thresholds[1], thresholds[2], thresholds[3],
                                       thresholds[0], thresholds[1], thresholds[2], thresholds[3]);
        for (size_t v = 0; v < vn; v += 8) {
            auto w = _mm256_loadu_si256((__m256i*)(work + v));
            auto o0 = _mm256_srli_epi32(w, 8);
            auto a = _mm256_i32gather_ps(values, o0, 4);

            // Extract i,j,k of sample point 0
            auto mask_i = _mm256_sub_epi32(X, one);
            auto i = _mm256_and_si256(o0, mask_i);
            auto j = _mm256_and_si256(_mm256_srl_epi32(o0, X_L2_epi64), mask_j);
            auto k = _mm256_srl_epi32(o0, XY_L2_epi64);

            // mask set if i < gridLayout.x-1 => i+1 is safe access.
            auto i0_less = _mm256_cmpgt_epi32(mask_i, i);
            auto j0_less = _mm256_cmpgt_epi32(mask_j, j);
            auto k0_less = _mm256_cmpgt_epi32(cellLayout_z_epi32, k);

            // Gather and calc discrete differences at first point
            auto ax = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(i0_less, one)), 4), a);
            auto ay = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(j0_less, X)), 4), a);
            auto az = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(k0_less, XY)), 4), a);

            auto di = _mm256_and_si256(_mm256_srli_epi32(w, 5), one); // 1 if intersection is on x-axis
            auto dj = _mm256_and_si256(_mm256_srli_epi32(w, 6), one);
            auto dk = _mm256_and_si256(_mm256_srli_epi32(w, 7), one);

            // All-set mask if intersection is on x-axis.
            auto mx = _mm256_cmpeq_epi32(di, one);
            auto my = _mm256_cmpeq_epi32(dj, one);
            auto mz = _mm256_cmpeq_epi32(dk, one);

            // Form index of sample point 1
            auto o1 = _mm256_add_epi32(_mm256_add_epi32(o0, di),
                                       _mm256_add_epi32(_mm256_and_si256(my, X),
                                                        _mm256_and_si256(mz, XY)));
            auto b = _mm256_i32gather_ps(values, o1, 4);

            // mask set if k + dk < gridLayout.z-1 => dk+1 is safe access.
            auto i1_less = _mm256_cmpgt_epi32(mask_i, _mm256_add_epi32(i, di));
            auto j1_less = _mm256_cmpgt_epi32(mask_j, _mm256_add_epi32(j, dj));
            auto k1_less = _mm256_cmpgt_epi32(cellLayout_z_epi32, _mm256_add_epi32(k, dk));

            // Gather and calc discrete differences at second point
            auto bx = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(i1_less, one)), 4), b);
            auto by = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(j1_less, X)), 4), b);
            auto bz = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(k1_less, XY)), 4), b);

            auto l = _mm256_and_si256(w, _mm256_set1_epi32((1 << 5) - 1));
            auto T = _mm256_permutevar_ps(C3, l);
            auto t = _mm256_mul_ps(_mm256_sub_ps(T, a), _mm256_rcp_ps(_mm256_sub_ps(b, a)));

            // Interpolate position
            auto px = _mm256_add_ps(_mm256_cvtepi32_ps(i), _mm256_and_ps(_mm256_castsi256_ps(mx), t));
            auto py = _mm256_add_ps(_mm256_cvtepi32_ps(j), _mm256_and_ps(_mm256_castsi256_ps(my), t));
            auto pz = _mm256_add_ps(_mm256_cvtepi32_ps(k), _mm256_and_ps(_mm256_castsi256_ps(mz), t));

            // Form negative of interpolated gradient (-a + t(a-b))
            auto gx = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(ax, bx)), ax);
            auto gy = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(ay, by)), ay);
            auto gz = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(az, bz)), az);

            auto half = _mm256_set1_ps(0.5f);
            auto tp = _mm256_mul_ps(oneOverTn, _mm256_add_ps(_mm256_cvtepi32_ps(l), half));

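            // Fast path: transpose eight SoA lanes (px, py, pz, gx, gy, gz, tp, 0.5)
            // into eight interleaved 8-float Vertex records using unpack/shuffle/blend
            // plus cross-lane permutes, then write them with aligned stores. The
            // commented _mm256_setr_ps lines below seed recognisable lane values for
            // debugging the shuffle network.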
            if (v + 8 <= vn) {
                //px = _mm256_setr_ps( 0,  8, 16, 24, 32, 40, 48, 55);
                //py = _mm256_setr_ps( 1,  9, 17, 25, 33, 41, 49, 56);
                //pz = _mm256_setr_ps( 2, 10, 18, 26, 34, 42, 50, 57);
                //gx = _mm256_setr_ps( 3, 11, 19, 27, 35, 43, 51, 58);
                //gy = _mm256_setr_ps( 4, 12, 20, 28, 36, 44, 52, 59);
                //gz = _mm256_setr_ps( 5, 13, 21, 29, 37, 45, 53, 60);
                //tp = _mm256_setr_ps( 6, 14, 22, 30, 38, 46, 54, 61);

                auto a0 = _mm256_unpacklo_ps(px, py);   // 0 1 8 9   | 32 33 40 41
                auto a2 = _mm256_unpacklo_ps(pz, gx);   // 2 3 10 11 | 34 35 42 43
                auto a4 = _mm256_unpacklo_ps(gy, gz);   // 4 5 12 13 | 36 37 44 45
                auto a6 = _mm256_unpacklo_ps(tp, half); // 6 h 14 h  | 38 h 46 h

                auto b0_ = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(1, 0, 3, 2)); // 8 9 2 3 | 40 41 34 35
                auto b0 = _mm256_blend_ps(a0, b0_, 0xCC); // = 11001100        // 0 1 2 3 | 32 33 34 35
                auto b1 = _mm256_blend_ps(a2, b0_, 0x33); // = 00110011        // 8 9 10 11 | 40 41 42 43

                //auto b0 = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(1, 0, 1, 0)); // 0 1 2 3 | 32 33 34 35
                //auto b1 = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(3, 2, 3, 2)); // 8 9 10 11 | 40 41 42 43

                auto b4_ = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(1, 0, 3, 2));
                auto b4 = _mm256_blend_ps(a4, b4_, 0xCC);
                auto b5 = _mm256_blend_ps(a6, b4_, 0x33);
                //auto b4 = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(1, 0, 1, 0)); // 4 5 6 h | 36 37 38 h
                //auto b5 = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(3, 2, 3, 2)); // 12 13 14 h | 44 45 46 h

                _mm256_store_ps((float*)vertices + 8 * (v + 0), _mm256_permute2f128_ps(b0, b4, 0x20));
                _mm256_store_ps((float*)vertices + 8 * (v + 4), _mm256_permute2f128_ps(b0, b4, 0x31));

                auto a1 = _mm256_unpackhi_ps(px, py); // 16 17 24 25 | 48 49 55 56
                auto a3 = _mm256_unpackhi_ps(pz, gx); // 18 19 26 27 | 50 51 57 58

                auto b2_ = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(1, 0, 3, 2));
                auto b2 = _mm256_blend_ps(a1, b2_, 0xCC);
                auto b3 = _mm256_blend_ps(a3, b2_, 0x33);
                //auto b2 = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(1, 0, 1, 0)); // 16 17 18 19 | 48 49 50 51
                //auto b3 = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(3, 2, 3, 2)); // 24 25 26 27 | 55 56 57 58

                auto a5 = _mm256_unpackhi_ps(gy, gz);   // 20 21 28 29 | 52 53 59 60
                auto a7 = _mm256_unpackhi_ps(tp, half); // 22 h 30 h   | 54 h 61 h

                auto b6_ = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(1, 0, 3, 2));
                auto b6 = _mm256_blend_ps(a5, b6_, 0xCC);
                auto b7 = _mm256_blend_ps(a7, b6_, 0x33);
                //auto b6 = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(1, 0, 1, 0)); // 20 21 22 h | 52 53 54 h
                //auto b7 = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(3, 2, 3, 2)); // 28 29 30 h | 59 60 61 h


                _mm256_store_ps((float*)vertices + 8 * (v + 1), _mm256_permute2f128_ps(b1, b5, 0x20));
                _mm256_store_ps((float*)vertices + 8 * (v + 2), _mm256_permute2f128_ps(b2, b6, 0x20));
                _mm256_store_ps((float*)vertices + 8 * (v + 3), _mm256_permute2f128_ps(b3, b7, 0x20));
                _mm256_store_ps((float*)vertices + 8 * (v + 5), _mm256_permute2f128_ps(b1, b5, 0x31));
                _mm256_store_ps((float*)vertices + 8 * (v + 6), _mm256_permute2f128_ps(b2, b6, 0x31));
                _mm256_store_ps((float*)vertices + 8 * (v + 7), _mm256_permute2f128_ps(b3, b7, 0x31));
            }
            else {
                for (size_t ll = 0; ll < 8 && v + ll < vn; ll++) {
                    vertices[v + ll].pos.x = px.m256_f32[ll];
                    vertices[v + ll].pos.y = py.m256_f32[ll];
                    vertices[v + ll].pos.z = pz.m256_f32[ll];
                    vertices[v + ll].nrm.x = gx.m256_f32[ll];
                    vertices[v + ll].nrm.y = gy.m256_f32[ll];
                    vertices[v + ll].nrm.z = gz.m256_f32[ll];
                    vertices[v + ll].tex.x = tp.m256_f32[ll];
                    vertices[v + ll].tex.y = 0.5f;
                }
            }

        }
        //assert(vix == vo);
        return vn;
    }
//#pragma optimize( "", on )

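    // indexExtract: scalar index generation over the whole grid. For every
    // intersected cell it walks the marching-cubes index table, locates the cell
    // that owns each referenced edge via the axis-shift bits, and forms the vertex
    // index as that cell's vertex base plus the number of edge vertices stored
    // before the wanted one (z-, then y-, then x-edge order). sub_mesh[l+1]
    // receives the running index count after threshold l, so consecutive entries
    // delimit each threshold's index range.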
    unsigned indexExtract(uint32_t* sub_mesh,
                          Idx* indices,
                          uint8_t* tmp,
                          const size3_t gridLayout,
                          const size_t T_n,
                          const size_t /*indexCount*/,
                          const size_t vertexCount)
    {
        const size3_t cellLayout = { gridLayout.x - 1, gridLayout.y - 1, gridLayout.z - 1 };
        const auto * axesTable = MarchingCubes::axesTable().data();
        const auto * indexTable = MarchingCubes::indexTable().data();

        unsigned iix = 0;
        for (unsigned l = 0; l < T_n; l++) {
            auto * offsets = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z * sizeof(uint32_t)*l);
            auto * codes = tmp + gridLayout.x*gridLayout.y*gridLayout.z*(sizeof(uint32_t)*T_n + l);
            auto * c = codes;
            for (unsigned k = 0; k < cellLayout.z; k++) {
                for (unsigned j = 0; j < cellLayout.y; j++) {
                    for (unsigned i = 0; i < cellLayout.x; i++) {
                        auto code = *c++;
                        if (code == 0) continue;

                        auto * axisShifts = indexTable + 16 * code;
                        while (true)
                        {
                            auto axisShift = *axisShifts++;
                            if (axisShift == 255) break;

                            auto ii = i + (axisShift & 8 ? 1 : 0);
                            auto jj = j + (axisShift & 16 ? 1 : 0);
                            auto kk = k + (axisShift & 32 ? 1 : 0);

                            auto shiftedCell = (kk*gridLayout.y + jj)*gridLayout.x + ii;

                            const auto shiftedCode = codes[shiftedCell];
                            const auto axes = axesTable[shiftedCode] & axisShift;

                            const auto ix = offsets[shiftedCell] + ((axes & 4) ? 1 : 0) + ((axes & 2) ? 1 : 0);
                            assert(ix < vertexCount);

                            assert(sizeof(Idx) != 2 || ix < 0xffff);
                            indices[iix++] = (Idx)ix;
                        }
                    }

                    c++;
                }
                c += gridLayout.x;
            }
            sub_mesh[l + 1] = iix;
        }

        return iix;
    }

//#pragma optimize( "", off)
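    // indexExtractAVX2: vectorised variant of indexExtract that consumes the
    // idxwork queue eight entries at a time, resolves each entry to the owning
    // cell's vertex offset in the same way as the scalar path, and writes 32-bit
    // indices. It is only referenced from the per-part index path that is
    // currently commented out in createIsoSurfacesSplitAVX2 below.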
    unsigned indexExtractAVX2(uint32_t* indices,
                              uint8_t* tmp,
                              unsigned gridLayoutL2_x,
                              unsigned gridLayoutL2_y,
                              unsigned gridLayout_z,
                              const unsigned T_n,
                              const unsigned indexCount,
                              const unsigned maxVertex)
    {
        const unsigned gridLayout_x = 1 << gridLayoutL2_x;
        const unsigned gridLayout_y = 1 << gridLayoutL2_y;
        //const unsigned cellLayout_x = gridLayout_x - 1;
        //const unsigned cellLayout_y = gridLayout_y - 1;
        //const unsigned cellLayout_z = gridLayout_z - 1;

        const auto * axesTable = MarchingCubes::axesTable().data();

        const auto * idxwork = (uint32_t*)(tmp + gridLayout_x*gridLayout_y*gridLayout_z*((sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t))*T_n));
        const auto * codes = tmp + gridLayout_x*gridLayout_y*gridLayout_z*(sizeof(uint32_t)*T_n);
        const auto * offsets = (uint32_t*)tmp;

        const auto one_epi32 = _mm256_set1_epi32(1);
        const auto three_epi32 = _mm256_set1_epi32(3);


        //const auto gridL2_3_epi64 = _mm_set1_epi64x(gridLayoutL2_x + gridLayoutL2_y + gridLayoutL2_z);
        const auto grid_3_epi32 = _mm256_set1_epi32(gridLayout_x*gridLayout_y*gridLayout_z);

        const auto X_epi32 = _mm256_set1_epi32(gridLayout_x);
        const auto XY_epi32 = _mm256_set1_epi32(gridLayout_x*gridLayout_y);
        const auto mask_255_epi32 = _mm256_set1_epi32(255);
        for (unsigned iix = 0; iix < indexCount; iix += 8) {
            const auto w = _mm256_load_si256((const __m256i*)(idxwork + iix));
            const auto o0 = _mm256_srli_epi32(w, 8);

            // (((w >> 5) & 1), (((w >> 6) & 1) ? gridLayout.x : 0), (((w >> 7) & 1) ? gridLayout.x*gridLayout.y : 0))
            const auto shiftI = _mm256_and_si256(_mm256_srli_epi32(w, 5), one_epi32);
            const auto shiftJ = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(_mm256_srli_epi32(w, 6), one_epi32), one_epi32), X_epi32);
            const auto shiftK = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(_mm256_srli_epi32(w, 7), one_epi32), one_epi32), XY_epi32);

            const auto shiftedCell = _mm256_add_epi32(_mm256_add_epi32(o0, shiftI),
                                                      _mm256_add_epi32(shiftJ, shiftK));

            //const auto L = _mm256_sll_epi32(_mm256_and_si256(w, three_epi32), gridL2_3_epi64); // l << (gridLayoutL2.x + gridLayoutL2.y + gridLayoutL2.z)

            const auto L = _mm256_mullo_epi32(grid_3_epi32, _mm256_and_si256(w, three_epi32));

            const auto I = _mm256_add_epi32(shiftedCell, L);

            const auto shiftedCode = _mm256_and_si256(_mm256_i32gather_epi32((const int*)codes, I, 1), mask_255_epi32);
            const auto axisShift = _mm256_srli_epi32(w, 2);
            const auto axes = _mm256_and_si256(_mm256_i32gather_epi32((const int*)axesTable, shiftedCode, 1), axisShift);

            const auto addJ = _mm256_and_si256(_mm256_srli_epi32(axes, 1), one_epi32);
            const auto addK = _mm256_and_si256(_mm256_srli_epi32(axes, 2), one_epi32);
            const auto offset = _mm256_add_epi32(_mm256_i32gather_epi32((const int*)offsets, I, 4),
                                                 _mm256_add_epi32(addJ, addK));

            for (unsigned i = 0; i < 8; i++) {
                assert(offset.m256i_u32[i] < maxVertex);
            }
            _mm256_storeu_si256((__m256i*)(indices + iix), offset);
        }

        return indexCount;
    }
//#pragma optimize( "", on )


}

namespace Cogs::Core::EchoSounder {


//#pragma optimize( "", off )
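    // createIsoSurfacesSplitAVX2: builds a single mesh holding the iso-surfaces of
    // all thresholds. The grid is split into `parts` z-slabs that overlap by one
    // sample slice; each slab is analysed into its own scratch region and its
    // vertices are extracted with the AVX2 path. Indices are then generated by the
    // scalar indexExtract pass and grouped into one sub-mesh per threshold, and the
    // bounds are taken from the generated vertices (in grid coordinates). The
    // analyze/vtx/idx out-parameters receive the elapsed time of each pass in
    // microseconds.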
    MeshHandle createIsoSurfacesSplitAVX2(Context* context,
                                          uint64_t& analyze,
                                          uint64_t& vtx,
                                          uint64_t& idx,
                                          MemoryBuffer& scratch, // scratch storage reused across calls (type assumed from Foundation/Memory/MemoryBuffer.h)
                                          const float* values,
                                          const glm::uvec3 gridLayout_,
                                          const glm::vec3 /*minCorner*/,
                                          const glm::vec3 /*maxCorner*/,
                                          const float *thresholds, size_t count)
    {
        size3_t gridLayout = { gridLayout_.x, gridLayout_.y, gridLayout_.z };
        size3_t gridLayoutL2 = {
            static_cast<size_t>(log2(gridLayout_.x)),
            static_cast<size_t>(log2(gridLayout_.y)),
            static_cast<size_t>(log2(gridLayout_.z))
        };
        assert(gridLayout.x != 0 && gridLayout.y != 0 && gridLayout.z != 0);
        assert((size_t(1) << gridLayoutL2.x) == gridLayout.x &&
               (size_t(1) << gridLayoutL2.y) == gridLayout.y &&
               (size_t(1) << gridLayoutL2.z) == gridLayout.z);

        const size_t T_n = count;

        // !!!!! This is where you set number of parts !!!!!
        const size_t parts = 2;
        assert((parts < gridLayout.z - 1) && "Parts must be at least a slice thick");

        const size_t partSpacing = (gridLayout.z - 2) / parts;
        const size_t partSize = partSpacing + 1;
        assert(1 <= partSpacing);
        assert(parts*partSpacing + 1 < gridLayout.z);
        assert(gridLayout.z <= (parts + 1)*partSpacing);

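        // Per sample and per threshold the scratch holds 4 bytes of vertex offsets,
        // 1 byte of cell codes, room for 3 vertex work items (12 bytes) and room for
        // 15 index work items (60 bytes, i.e. at most five triangles per cell).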
        size_t partScratchSize = (sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t) + 15 * sizeof(uint32_t))*T_n*gridLayout.x*gridLayout.y*partSize;
        scratch.resize(parts * partScratchSize, false);
        auto * tmp = (uint8_t*)scratch.data();
        //std::memset(tmp, -1, scratch.size());

        unsigned occupiedCells = 0;
        unsigned indexCount = 0;
        unsigned vertexCount = 0;
        std::vector<unsigned> partVertexOffset(parts + 1);
        std::vector<unsigned> partIndexOffset(parts + 1);

        // Analyze pass
        auto timer = Timer::startNew();
        for (size_t p = 0; p < parts; p++) {
            size_t slice_a = partSpacing * p;
            size_t slice_b = std::min(slice_a + partSize, gridLayout.z);

            partVertexOffset[p] = vertexCount;
            partIndexOffset[p] = indexCount;
            analyzeAVX2(tmp + partScratchSize * p, vertexCount, occupiedCells, indexCount,
                        values + slice_a * gridLayout.x*gridLayout.y,
                        gridLayoutL2.x, gridLayoutL2.y, slice_b - slice_a,
                        T_n, thresholds);
        }
        partVertexOffset[parts] = vertexCount;
        partIndexOffset[parts] = indexCount;
        analyze = timer.elapsedMicroseconds();

        if (occupiedCells == 0) {
            vtx = 0;
            idx = 0;
            return MeshHandle::NoHandle; // nothing intersects any iso-surface
        }

        auto mesh = context->meshManager->createLocked();
        //mesh->setBounds(Cogs::Geometry::BoundingBox{ glm::vec3(0,0,0), glm::vec3(gridLayout.x, gridLayout.y, gridLayout.z) });
        mesh->setMeshFlag(MeshFlags::ClockwiseWinding);
        mesh->primitiveType = PrimitiveType::TriangleList;
        auto vertices = mesh->map<Vertex>(VertexDataType::Interleaved0, VertexFormats::Pos3fNorm3fTex2f, vertexCount);

        // Extract vertices
        timer = Timer::startNew();
        for (size_t p = 0; p < parts; p++) {
            size_t vn = partVertexOffset[p + 1] - partVertexOffset[p];
            if (vn == 0) continue;

            size_t slice_a = partSpacing * p;
            size_t slice_b = std::min(slice_a + partSize, gridLayout.z);
            vertexExtractAVX2(vertices.data + partVertexOffset[p], // assumed to be 32b aligned since we use aligned stores.
                              tmp + partScratchSize * p,
                              values + slice_a * gridLayout.x*gridLayout.y,
                              gridLayoutL2.x, gridLayoutL2.y, slice_b - slice_a,
                              T_n, thresholds, vn);
        }
        vtx = timer.elapsedMicroseconds();

        // extract indices
        //std::vector<uint32_t> indices(indexCount + 8);
        //timer = Timer::startNew();
        //for (unsigned p = 0; p < parts; p++) {
        //    auto ixn = partIndexOffset[p + 1] - partIndexOffset[p];
        //    if (ixn == 0) continue;
        //    unsigned slice_a = partSpacing * p;
        //    unsigned slice_b = std::min(slice_a + partSize, gridLayout.z);
        //    indexExtractAVX2(indices.data() + partIndexOffset[p],
        //                     tmp + partScratchSize * p,
        //                     gridLayoutL2.x, gridLayoutL2.y, slice_b - slice_a,
        //                     T_n, ixn, partVertexOffset[p + 1]);
        //}
        //idx = timer.elapsedMicroseconds();

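        // Active index path: the scalar indexExtract writes 16-bit indices (Idx)
        // directly into the mapped index stream and records the per-threshold index
        // ranges in sub_mesh, so the total vertex count must stay below 0xffff
        // (asserted inside indexExtract).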
        std::vector<uint32_t> sub_mesh(T_n + 1, 0);
        mesh->clearIndexes();
        Idx *indices = (Idx*)mesh->mapStream(VertexDataType::Indexes, 0, indexCount, sizeof(Idx), true);
        timer = Timer::startNew();
        auto iix = indexExtract(sub_mesh.data(), indices, tmp, gridLayout, T_n, indexCount, vertexCount);
        assert(iix == indexCount);
        mesh->setMeshFlag(MeshFlags::Indexed);
        mesh->setMeshFlag(MeshFlags::IndexesChanged);
        idx = timer.elapsedMicroseconds();

        auto subMeshes = mesh->mapSubMeshes(static_cast<uint32_t>(T_n));
        for (size_t l = 0; l < T_n; l++) {
            uint32_t start = sub_mesh[l];
            uint32_t size = sub_mesh[l + 1] - sub_mesh[l];
            subMeshes[l] = { start, size, PrimitiveType::TriangleList };
        }
        mesh->setCount(indexCount);

        auto box = Geometry::makeEmptyBoundingBox<Geometry::BoundingBox>();
        for (unsigned i = 0; i < vertexCount; i++) {
            glm::vec3 pos = vertices[i].pos;
            box.min = glm::min(box.min, pos);
            box.max = glm::max(box.max, pos);
        }
        mesh->setBounds(box);

        _mm256_zeroupper();
        return mesh.getHandle();
    }
//#pragma optimize( "", on )

}