Cogs.Core
UniformGridSystem_isosurf_avx2.cpp

#include "Resources/Resources.h"
#include "Resources/MeshManager.h"
#include "Resources/VertexFormats.h"

#include "Context.h"

#include "../../../IsoSurfaces/MarchingCubesTables.h"

#include "Foundation/Logging/Logger.h"
#include "Foundation/Memory/MemoryBuffer.h"
#include "Foundation/Platform/Timer.h"

#include <glm/glm.hpp>

#include <immintrin.h> // AVX2 intrinsics (_mm256_*, _mm_*)

#include <cassert>
#include <cmath>   // log2
#include <cstdint>
#include <vector>

namespace {
    using namespace Cogs::Core;

    Cogs::Logging::Log logger = Cogs::Logging::getLogger("UniformGridSystem");

    struct Vertex
    {
        glm::vec3 pos;
        glm::vec3 nrm;
        glm::vec2 tex;
    };
    typedef uint16_t Idx;

    struct size3_t { size_t x, y, z; };

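    // analyzeAVX2 classifies every cell against each threshold and builds, in the
    // caller-provided scratch buffer `tmp`, the data the extraction passes below
    // consume:
    //
    //   offsets[]  one uint32 per sample per threshold: running index of the first
    //              vertex emitted by the cell anchored at that sample.
    //   codes[]    one uint8 per sample per threshold: the cell's 8-bit marching-
    //              cubes case code (0 for both empty and completely full cells).
    //   vtxwork[]  one uint32 per vertex to create: (cell << 8) | (axis << 5) | l.
    //   idxwork[]  one uint32 per index to create:  (cell << 8) | (axisShift << 2) | l,
    //              where the threshold index l only uses the low two bits.
    //
    // These field widths are inferred from how the values are packed here and
    // unpacked in vertexExtractAVX2 / indexExtractAVX2 below.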
//#pragma optimize( "", off )
    void analyzeAVX2(uint8_t* tmp,
                     unsigned& vertexCount_,
                     unsigned& occupiedCells_,
                     unsigned& indexCount_,
                     const float* values,
                     const glm::uvec3 gridLayout_,
                     const size_t T_n,
                     const float* thresholds)
    {
        size3_t gridLayout = { gridLayout_.x, gridLayout_.y, gridLayout_.z };

        assert((gridLayout.x & 7) == 0 && (gridLayout.y & 7) == 0 && (gridLayout.z & 7) == 0);
        assert((gridLayout.y*gridLayout.x & 31) == 0);
        assert(gridLayout.x <= 256 && "x must fit inside a byte");
        size3_t cellLayout = { gridLayout.x - 1, gridLayout.y - 1, gridLayout.z - 1 };
        size3_t gridLayoutL2 = {
            static_cast<size_t>(log2(gridLayout_.x)),
            static_cast<size_t>(log2(gridLayout_.y)),
            static_cast<size_t>(log2(gridLayout_.z))
        };
        assert((size_t(1) << gridLayoutL2.x) == gridLayout.x &&
               (size_t(1) << gridLayoutL2.y) == gridLayout.y &&
               (size_t(1) << gridLayoutL2.z) == gridLayout.z);

        const auto * axesTable = MarchingCubes::axesTable().data();
        const auto * indexCountTable = MarchingCubes::indexCountTable().data();
        const auto * indexTable = MarchingCubes::indexTable().data();

        auto * vtxwork = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z*((sizeof(uint32_t) + sizeof(uint8_t))*T_n));
        auto * idxwork = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z*((sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t))*T_n));

        unsigned vertexCount = 0;
        unsigned occupiedCells = 0;
        unsigned indexCount = 0;
        for (unsigned l = 0; l < T_n; l++) {
            auto * offsets = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z * sizeof(uint32_t)*l);
            auto * codes = (tmp + gridLayout.x*gridLayout.y*gridLayout.z*(sizeof(uint32_t)*T_n + l));
            auto T = _mm256_set1_ps(thresholds[l]);
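            // First pass: classify every sample against the current threshold and
            // store one byte per sample (1 if the sample lies below the threshold,
            // 0 otherwise). A scalar sketch of what the 8-wide loop below computes:
            //
            //     for (size_t s = 0; s < gridLayout.x*gridLayout.y*gridLayout.z; ++s)
            //         codes[s] = values[s] < thresholds[l] ? 1 : 0;
            //
            // The AVX2 version compares 8 floats at a time, masks the comparison down
            // to a single bit per lane and packs the eight 32-bit lanes into bytes.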
            {
                auto M = _mm256_set1_epi32(1);
                auto * c = codes;
                for (unsigned k = 0; k < gridLayout.z*gridLayout.y*gridLayout.x; k += 8) {
                    auto t0 = _mm256_load_ps(values + k);
                    auto t1 = _mm256_and_si256(M, _mm256_castps_si256(_mm256_cmp_ps(t0, T, _CMP_LT_OQ)));
                    auto t2 = _mm256_extracti128_si256(t1, 1);
                    auto t3 = _mm256_extracti128_si256(t1, 0);
                    auto t4 = _mm_packus_epi32(t3, t2);
                    auto t5 = _mm_packus_epi16(t4, _mm_setzero_si128());
                    _mm_storel_epi64((__m128i*)c, t5);
                    c += 8;
                }
            }
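            // Second pass: fold the slice above into each sample byte. Afterwards
            // bit 0 of codes[s] says whether sample (x, y, z) is below the threshold
            // and bit 4 says the same for sample (x, y, z + 1). The last slice has no
            // neighbour above, so it is paired with itself, which keeps cells in that
            // slice from ever seeing a crossing along the z axis.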
            { // merge across slices
                auto * c = (__m256i*)codes;
                const auto M = (gridLayout.y*gridLayout.x) / sizeof(__m256i);
                for (unsigned k = 0; k < cellLayout.z; k++) {
                    for (unsigned j = 0; j < M; j++) {
                        auto t0 = _mm256_load_si256((__m256i*)c);
                        auto t1 = _mm256_load_si256((__m256i*)(c + M));
                        auto t2 = _mm256_slli_epi32(t1, 4);
                        auto t3 = _mm256_or_si256(t0, t2);
                        _mm256_store_si256(c++, t3);
                    }
                }
                for (unsigned j = 0; j < M; j++) {
                    auto t0 = _mm256_load_si256((__m256i*)c);
                    auto t2 = _mm256_slli_epi32(t0, 4);
                    auto t3 = _mm256_or_si256(t0, t2);
                    _mm256_store_si256(c++, t3);
                }
            }
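            // Third pass: combine each sample byte with its +x and +y neighbours
            // (and, via bit 4, the merged +z neighbours) into the full 8-bit
            // marching-cubes case code of the cell anchored at that sample.
            // Completely full cells (code 0xff) are cleared to 0 so empty and full
            // cells can both be skipped with one zero test. Samples on the +x/+y/+z
            // boundary ("skirt") reuse their own value for the missing neighbour;
            // such cells may still emit vertices (so interior neighbours can index
            // them) but never emit indices of their own.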
            {
                auto mask_i_epi8 = _mm256_set1_epi8(static_cast<char>(gridLayout.x - 1));
                auto mask_j_epi8 = _mm256_set1_epi8(static_cast<char>(gridLayout.y - 1));
                auto stride = _mm256_set1_epi8(32);
                auto ki = _mm256_set_epi8(31, 30, 29, 28,
                                          27, 26, 25, 24,
                                          23, 22, 21, 20,
                                          19, 18, 17, 16,
                                          15, 14, 13, 12,
                                          11, 10, 9, 8,
                                          7, 6, 5, 4,
                                          3, 2, 1, 0);
                auto kj = _mm256_set_epi8(31 >> gridLayoutL2.x, 30 >> gridLayoutL2.x, 29 >> gridLayoutL2.x, 28 >> gridLayoutL2.x,
                                          27 >> gridLayoutL2.x, 26 >> gridLayoutL2.x, 25 >> gridLayoutL2.x, 24 >> gridLayoutL2.x,
                                          23 >> gridLayoutL2.x, 22 >> gridLayoutL2.x, 21 >> gridLayoutL2.x, 20 >> gridLayoutL2.x,
                                          19 >> gridLayoutL2.x, 18 >> gridLayoutL2.x, 17 >> gridLayoutL2.x, 16 >> gridLayoutL2.x,
                                          15 >> gridLayoutL2.x, 14 >> gridLayoutL2.x, 13 >> gridLayoutL2.x, 12 >> gridLayoutL2.x,
                                          11 >> gridLayoutL2.x, 10 >> gridLayoutL2.x, 9 >> gridLayoutL2.x, 8 >> gridLayoutL2.x,
                                          7 >> gridLayoutL2.x, 6 >> gridLayoutL2.x, 5 >> gridLayoutL2.x, 4 >> gridLayoutL2.x,
                                          3 >> gridLayoutL2.x, 2 >> gridLayoutL2.x, 1 >> gridLayoutL2.x, 0 >> gridLayoutL2.x);
                auto ones = _mm256_set1_epi32(~0);

                //const auto mask_i = (1 << gridLayoutL2.x) - 1;
                //const auto mask_j = ((1 << gridLayoutL2.y) - 1) << gridLayoutL2.x;
                //const auto mask_k = ((1 << gridLayoutL2.z) - 1) << (gridLayoutL2.x + gridLayoutL2.y);

                for (unsigned i = 0; i < gridLayout.z*gridLayout.y*gridLayout.x; i += 32) {
                    auto skirt_i = _mm256_cmpeq_epi8(_mm256_and_si256(ki, mask_i_epi8), mask_i_epi8);
                    ki = _mm256_add_epi8(ki, stride);

                    // corner 0,0,0 and corner 1,0,0
                    auto t0 = _mm256_load_si256((__m256i*)(codes + i));
                    auto t1 = _mm256_loadu_si256((__m256i*)(codes + i + 1));
                    auto t2 = _mm256_blendv_epi8(t1, t0, skirt_i);
                    auto t3 = _mm256_or_si256(t0, _mm256_slli_epi32(t2, 1));

                    auto row = _mm256_add_epi8(_mm256_set1_epi8(static_cast<char>(i >> gridLayoutL2.x)), kj);
                    auto skirt_j = _mm256_cmpeq_epi8(_mm256_and_si256(row, mask_j_epi8), mask_j_epi8);

                    // corner 0,1,0 and corner 1,1,0
                    auto r0 = _mm256_load_si256((__m256i*)(codes + gridLayout.x + i));
                    auto r1 = _mm256_loadu_si256((__m256i*)(codes + gridLayout.x + i + 1));
                    auto r2 = _mm256_blendv_epi8(r1, r0, skirt_i);
                    auto r3 = _mm256_or_si256(r0, _mm256_slli_epi32(r2, 1));
                    auto r4 = _mm256_blendv_epi8(r3, t3, skirt_j);

                    auto t5 = _mm256_or_si256(t3, _mm256_slli_epi32(r4, 2)); // merge

                    auto ones_ = _mm256_cmpeq_epi8(_mm256_setzero_si256(), _mm256_setzero_si256());
                    auto m2 = _mm256_cmpeq_epi8(t5, ones_);
                    auto code_epi8 = _mm256_andnot_si256(m2, t5);
                    _mm256_store_si256((__m256i*)(codes + i), code_epi8);

                    auto m3 = _mm256_cmpeq_epi8(code_epi8, _mm256_setzero_si256());
                    uint32_t zerolanes = _mm256_movemask_epi8(m3);
                    if (zerolanes == 0xffffffff) continue;

                    auto skirt_k = _mm256_set1_epi8((i >> (gridLayoutL2.x + gridLayoutL2.y)) == ((1u << gridLayoutL2.z) - 1u) ? 255 : 0);
                    auto skirt = _mm256_or_si256(_mm256_or_si256(skirt_i, skirt_j), skirt_k);

                    for (unsigned r = 0; r < 32; r++) {
                        if (m3.m256i_u8[r] == 0) {
                            auto code = code_epi8.m256i_u8[r];
                            offsets[i + r] = vertexCount;
                            auto k = ((i + r) << 8) | l;

                            auto axes = axesTable[code];
                            if (axes & 4) {
                                vtxwork[vertexCount++] = k | (4 << 5);
                            }
                            if (axes & 2) {
                                vtxwork[vertexCount++] = k | (2 << 5);
                            }
                            if (axes & 1) {
                                vtxwork[vertexCount++] = k | (1 << 5);
                            }
                            if (skirt.m256i_u8[r] == 0) {
                                occupiedCells++;
                                auto Ni = indexCountTable[code];
                                auto * as = indexTable + 16 * size_t(code);
                                for (unsigned ll = 0; ll < Ni; ll += 3) {
                                    idxwork[indexCount++] = k | (as[ll + 0] << 2);
                                    idxwork[indexCount++] = k | (as[ll + 1] << 2);
                                    idxwork[indexCount++] = k | (as[ll + 2] << 2);
                                }
                            }
                        }
                    }
                }
            }
        }

        // Pad both work queues up to a multiple of 8 by repeating the last entry,
        // so the 8-wide extraction loops below always process full registers.
        if (auto t = vertexCount; t) {
            while (t & 7) vtxwork[t++] = vtxwork[vertexCount - 1];
        }
        if (auto t = indexCount; t) {
            while (t & 7) idxwork[t++] = idxwork[indexCount - 1];
        }

        vertexCount_ = vertexCount;
        occupiedCells_ = occupiedCells;
        indexCount_ = indexCount;
    }
    //#pragma optimize( "", on )


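    // vertexExtractAVX2 converts each vtxwork entry into one interleaved
    // Pos3f/Norm3f/Tex2f vertex, eight entries at a time. For an edge running from
    // sample a to sample b along one axis, the scalar equivalent of the arithmetic
    // below is:
    //
    //     t     = (threshold - a) / (b - a);          // reciprocal approximated with _mm256_rcp_ps
    //     pos   = corner + t * axis;                  // in grid-index units
    //     nrm   = -((1 - t) * d(a) + t * d(b));       // d() = one-sided difference, clamped at the border
    //     tex.x = (l + 0.5f) / T_n;  tex.y = 0.5f;    // per-threshold texture coordinate
    //
    // The normals are left unnormalised.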
    //#pragma optimize( "", off )
    unsigned vertexExtractAVX2(Cogs::Core::MappedStream<Vertex>& vertices,
                               uint8_t* tmp,
                               const float* values,
                               const glm::uvec3 gridLayout_,
                               const unsigned T_n,
                               const float* thresholds,
                               unsigned vn)
    {
        size3_t gridLayout = { gridLayout_.x, gridLayout_.y, gridLayout_.z };
        size3_t cellLayout = { gridLayout.x - 1, gridLayout.y - 1, gridLayout.z - 1 };

        size3_t gridLayoutL2 = {
            static_cast<size_t>(log2(gridLayout_.x)),
            static_cast<size_t>(log2(gridLayout_.y)),
            static_cast<size_t>(log2(gridLayout_.z))
        };
        assert((size_t(1) << gridLayoutL2.x) == gridLayout.x &&
               (size_t(1) << gridLayoutL2.y) == gridLayout.y &&
               (size_t(1) << gridLayoutL2.z) == gridLayout.z);


        auto * work = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z*((sizeof(uint32_t) + sizeof(uint8_t))*T_n));

        //const auto mask_i = _mm256_set1_epi32(gridLayout.x - 1);
        const auto mask_j = _mm256_set1_epi32(static_cast<int>(gridLayout.y - 1));
        const auto mask_k = _mm256_set1_epi32(static_cast<int>(gridLayout.z - 1));
        const auto X_L2_epi64 = _mm_insert_epi64(_mm_undefined_si128(), gridLayoutL2.x, 0);
        const auto XY_L2_epi64 = _mm_insert_epi64(_mm_undefined_si128(), gridLayoutL2.x + gridLayoutL2.y, 0);
        const auto oneOverTn = _mm256_set1_ps(1.f / T_n);

        const auto one = _mm256_set1_epi32(1);
        const auto X = _mm256_set1_epi32(static_cast<int>(gridLayout.x));
        const auto XY = _mm256_set1_epi32(static_cast<int>(gridLayout.x*gridLayout.y));
        const auto C3 = _mm256_setr_ps(thresholds[0], thresholds[1], thresholds[2], thresholds[3],
                                       thresholds[0], thresholds[1], thresholds[2], thresholds[3]);
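        // Note: C3 holds only thresholds[0..3] and the _mm256_permutevar_ps below
        // selects with the low two bits of the threshold index, so this path assumes
        // at most four thresholds (and reads thresholds[0..3] even when T_n is smaller).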
        for (size_t v = 0; v < vn; v += 8) {
            auto w = _mm256_loadu_si256((__m256i*)(work + v));
            auto o0 = _mm256_srli_epi32(w, 8);
            auto a = _mm256_i32gather_ps(values, o0, 4);

            // Extract i,j,k of sample point 0
            auto mask_i = _mm256_sub_epi32(X, one);
            auto i = _mm256_and_si256(o0, mask_i);
            auto j = _mm256_and_si256(_mm256_srl_epi32(o0, X_L2_epi64), mask_j);
            auto k = _mm256_and_si256(_mm256_srl_epi32(o0, XY_L2_epi64), mask_k);

            // mask set if i < gridLayout.x-1 => i+1 is safe to access.
            auto i0_less = _mm256_cmpgt_epi32(mask_i, i);
            auto j0_less = _mm256_cmpgt_epi32(mask_j, j);
            auto k0_less = _mm256_cmpgt_epi32(mask_k, k);

            // Gather and calc discrete differences at first point
            auto ax = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(i0_less, one)), 4), a);
            auto ay = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(j0_less, X)), 4), a);
            auto az = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o0, _mm256_and_si256(k0_less, XY)), 4), a);

            auto di = _mm256_and_si256(_mm256_srli_epi32(w, 5), one); // 1 if the intersection is on the x-axis (dj, dk likewise for y, z)
            auto dj = _mm256_and_si256(_mm256_srli_epi32(w, 6), one);
            auto dk = _mm256_and_si256(_mm256_srli_epi32(w, 7), one);

            // All-set mask if the intersection is on the x, y or z axis respectively.
            auto mx = _mm256_cmpeq_epi32(di, one);
            auto my = _mm256_cmpeq_epi32(dj, one);
            auto mz = _mm256_cmpeq_epi32(dk, one);

            // Form index of sample point 1
            auto o1 = _mm256_add_epi32(_mm256_add_epi32(o0, di),
                                       _mm256_add_epi32(_mm256_and_si256(my, X),
                                                        _mm256_and_si256(mz, XY)));
            auto b = _mm256_i32gather_ps(values, o1, 4);

            // mask set if e.g. k + dk < gridLayout.z-1 => k + dk + 1 is safe to access.
            auto i1_less = _mm256_cmpgt_epi32(mask_i, _mm256_add_epi32(i, di));
            auto j1_less = _mm256_cmpgt_epi32(mask_j, _mm256_add_epi32(j, dj));
            auto k1_less = _mm256_cmpgt_epi32(mask_k, _mm256_add_epi32(k, dk));

            // Gather and calc discrete differences at second point
            auto bx = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(i1_less, one)), 4), b);
            auto by = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(j1_less, X)), 4), b);
            auto bz = _mm256_sub_ps(_mm256_i32gather_ps(values, _mm256_add_epi32(o1, _mm256_and_si256(k1_less, XY)), 4), b);

            auto l = _mm256_and_si256(w, _mm256_set1_epi32((1 << 5) - 1));
            auto T = _mm256_permutevar_ps(C3, l);
            auto t = _mm256_mul_ps(_mm256_sub_ps(T, a), _mm256_rcp_ps(_mm256_sub_ps(b, a)));

            // Interpolate position
            auto px = _mm256_add_ps(_mm256_cvtepi32_ps(i), _mm256_and_ps(_mm256_castsi256_ps(mx), t));
            auto py = _mm256_add_ps(_mm256_cvtepi32_ps(j), _mm256_and_ps(_mm256_castsi256_ps(my), t));
            auto pz = _mm256_add_ps(_mm256_cvtepi32_ps(k), _mm256_and_ps(_mm256_castsi256_ps(mz), t));

            // Form negative of interpolated gradient (-a + t(a-b))
            auto gx = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(ax, bx)), ax);
            auto gy = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(ay, by)), ay);
            auto gz = _mm256_sub_ps(_mm256_mul_ps(t, _mm256_sub_ps(az, bz)), az);

            auto half = _mm256_set1_ps(0.5f);
            auto tp = _mm256_mul_ps(oneOverTn, _mm256_add_ps(_mm256_cvtepi32_ps(l), half));

            if (v + 8 <= vn) {
                //px = _mm256_setr_ps( 0,  8, 16, 24, 32, 40, 48, 55);
                //py = _mm256_setr_ps( 1,  9, 17, 25, 33, 41, 49, 56);
                //pz = _mm256_setr_ps( 2, 10, 18, 26, 34, 42, 50, 57);
                //gx = _mm256_setr_ps( 3, 11, 19, 27, 35, 43, 51, 58);
                //gy = _mm256_setr_ps( 4, 12, 20, 28, 36, 44, 52, 59);
                //gz = _mm256_setr_ps( 5, 13, 21, 29, 37, 45, 53, 60);
                //tp = _mm256_setr_ps( 6, 14, 22, 30, 38, 46, 54, 61);

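                // px..tp each hold one attribute of eight consecutive vertices
                // (structure-of-arrays). The unpack/shuffle/blend/permute sequence
                // below is an 8x8 transpose that writes them back out as eight
                // interleaved Pos3fNorm3fTex2f vertices, with tex.y padded to 0.5
                // via `half`. The numbers in the trailing comments refer to the
                // commented-out debug constants above.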
                auto a0 = _mm256_unpacklo_ps(px, py);   //  0  1  8  9 | 32 33 40 41
                auto a2 = _mm256_unpacklo_ps(pz, gx);   //  2  3 10 11 | 34 35 42 43
                auto a4 = _mm256_unpacklo_ps(gy, gz);   //  4  5 12 13 | 36 37 44 45
                auto a6 = _mm256_unpacklo_ps(tp, half); //  6  h 14  h | 38  h 46  h

                auto b0_ = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(1, 0, 3, 2)); //  8  9  2  3 | 40 41 34 35
                auto b0 = _mm256_blend_ps(a0, b0_, 0xCC); // = 11001100       //  0  1  2  3 | 32 33 34 35
                auto b1 = _mm256_blend_ps(a2, b0_, 0x33); // = 00110011       //  8  9 10 11 | 40 41 42 43

                //auto b0 = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(1, 0, 1, 0)); //  0  1  2  3 | 32 33 34 35
                //auto b1 = _mm256_shuffle_ps(a0, a2, _MM_SHUFFLE(3, 2, 3, 2)); //  8  9 10 11 | 40 41 42 43

                auto b4_ = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(1, 0, 3, 2));
                auto b4 = _mm256_blend_ps(a4, b4_, 0xCC);
                auto b5 = _mm256_blend_ps(a6, b4_, 0x33);
                //auto b4 = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(1, 0, 1, 0)); //  4  5  6  h | 36 37 38  h
                //auto b5 = _mm256_shuffle_ps(a4, a6, _MM_SHUFFLE(3, 2, 3, 2)); // 12 13 14  h | 44 45 46  h

                _mm256_store_ps((float*)vertices.data + 8 * (v + 0), _mm256_permute2f128_ps(b0, b4, 0x20));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 4), _mm256_permute2f128_ps(b0, b4, 0x31));

                auto a1 = _mm256_unpackhi_ps(px, py); // 16 17 24 25 | 48 49 55 56
                auto a3 = _mm256_unpackhi_ps(pz, gx); // 18 19 26 27 | 50 51 57 58

                auto b2_ = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(1, 0, 3, 2));
                auto b2 = _mm256_blend_ps(a1, b2_, 0xCC);
                auto b3 = _mm256_blend_ps(a3, b2_, 0x33);
                //auto b2 = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(1, 0, 1, 0)); // 16 17 18 19 | 48 49 50 51
                //auto b3 = _mm256_shuffle_ps(a1, a3, _MM_SHUFFLE(3, 2, 3, 2)); // 24 25 26 27 | 55 56 57 58

                auto a5 = _mm256_unpackhi_ps(gy, gz);   // 20 21 28 29 | 52 53 59 60
                auto a7 = _mm256_unpackhi_ps(tp, half); // 22  h 30  h | 54  h 61  h

                auto b6_ = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(1, 0, 3, 2));
                auto b6 = _mm256_blend_ps(a5, b6_, 0xCC);
                auto b7 = _mm256_blend_ps(a7, b6_, 0x33);
                //auto b6 = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(1, 0, 1, 0)); // 20 21 22  h | 52 53 54  h
                //auto b7 = _mm256_shuffle_ps(a5, a7, _MM_SHUFFLE(3, 2, 3, 2)); // 28 29 30  h | 59 60 61  h

                _mm256_store_ps((float*)vertices.data + 8 * (v + 1), _mm256_permute2f128_ps(b1, b5, 0x20));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 2), _mm256_permute2f128_ps(b2, b6, 0x20));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 3), _mm256_permute2f128_ps(b3, b7, 0x20));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 5), _mm256_permute2f128_ps(b1, b5, 0x31));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 6), _mm256_permute2f128_ps(b2, b6, 0x31));
                _mm256_store_ps((float*)vertices.data + 8 * (v + 7), _mm256_permute2f128_ps(b3, b7, 0x31));
            }
            else {
                for (size_t ll = 0; ll < 8 && v + ll < vn; ll++) {
                    vertices[v + ll].pos.x = px.m256_f32[ll];
                    vertices[v + ll].pos.y = py.m256_f32[ll];
                    vertices[v + ll].pos.z = pz.m256_f32[ll];
                    vertices[v + ll].nrm.x = gx.m256_f32[ll];
                    vertices[v + ll].nrm.y = gy.m256_f32[ll];
                    vertices[v + ll].nrm.z = gz.m256_f32[ll];
                    vertices[v + ll].tex.x = tp.m256_f32[ll];
                    vertices[v + ll].tex.y = 0.5f;
                }
            }

        }
        //assert(vix == vo);
        return vn;
    }
    //#pragma optimize( "", on )

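    // indexExtract is the scalar index-resolution pass. For every non-empty cell it
    // walks the MarchingCubes::indexTable() entries for that cell's code: the high
    // bits of an entry select the neighbouring cell that owns the edge vertex, and
    // the final index is that cell's offsets[] value plus one for each of the
    // earlier-emitted axes (emission order in analyzeAVX2 is 4, 2, 1) left unmasked
    // by the entry's low bits. It also records the running index count per threshold
    // in sub_mesh[], so the caller can build one sub-mesh per iso-value.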
    unsigned indexExtract(uint32_t* sub_mesh,
                          Idx* indices,
                          uint8_t* tmp,
                          const glm::uvec3 gridLayout,
                          const unsigned T_n,
                          const unsigned /*indexCount*/,
                          const unsigned vertexCount)
    {
        const auto cellLayout = gridLayout - glm::uvec3(1, 1, 1);
        const auto * axesTable = MarchingCubes::axesTable().data();
        const auto * indexTable = MarchingCubes::indexTable().data();

        unsigned iix = 0;
        for (unsigned l = 0; l < T_n; l++) {
            auto * offsets = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z * sizeof(uint32_t)*l);
            auto * codes = tmp + gridLayout.x*gridLayout.y*gridLayout.z*(sizeof(uint32_t)*T_n + l);
            auto * c = codes;
            for (unsigned k = 0; k < cellLayout.z; k++) {
                for (unsigned j = 0; j < cellLayout.y; j++) {
                    for (unsigned i = 0; i < cellLayout.x; i++) {
                        auto code = *c++;
                        if (code == 0) continue;

                        auto * axisShifts = indexTable + 16 * code;
                        while (true)
                        {
                            auto axisShift = *axisShifts++;
                            if (axisShift == 255) break;

                            auto ii = i + (axisShift & 8 ? 1 : 0);
                            auto jj = j + (axisShift & 16 ? 1 : 0);
                            auto kk = k + (axisShift & 32 ? 1 : 0);

                            auto shiftedCell = (kk*gridLayout.y + jj)*gridLayout.x + ii;

                            const auto shiftedCode = codes[shiftedCell];
                            const auto axes = axesTable[shiftedCode] & axisShift;

                            const auto ix = offsets[shiftedCell] + ((axes & 4) ? 1 : 0) + ((axes & 2) ? 1 : 0);
                            assert(ix < vertexCount);

                            assert(sizeof(Idx) != 2 || ix < 0xffff);
                            indices[iix++] = (Idx)ix;
                        }
                    }

                    c++;
                }
                c += gridLayout.x;
            }
            sub_mesh[l + 1] = iix;
        }

        return iix;
    }

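    // indexExtractAVX2 resolves the same indices directly from the packed idxwork
    // queue, eight entries per iteration, using gathers for the owning cell's code
    // and vertex offset. It produces 32-bit indices and does not fill per-threshold
    // ranges; createIsoSurfacesAVX2 below currently calls the scalar indexExtract
    // instead.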
//#pragma optimize( "", off)
    unsigned indexExtractAVX2(uint32_t* indices,
                              uint8_t* tmp,
                              const glm::uvec3 gridLayout_,
                              const unsigned T_n,
                              const unsigned indexCount,
                              const unsigned /*vertexCount*/)
    {
        size3_t gridLayout = { gridLayout_.x, gridLayout_.y, gridLayout_.z };
        size3_t cellLayout = { gridLayout.x - 1, gridLayout.y - 1, gridLayout.z - 1 };
        const auto * axesTable = MarchingCubes::axesTable().data();
        auto * idxwork = (uint32_t*)(tmp + gridLayout.x*gridLayout.y*gridLayout.z*((sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t))*T_n));

        size3_t gridLayoutL2 = {
            static_cast<size_t>(log2(gridLayout_.x)),
            static_cast<size_t>(log2(gridLayout_.y)),
            static_cast<size_t>(log2(gridLayout_.z))
        };
        assert((size_t(1) << gridLayoutL2.x) == gridLayout.x &&
               (size_t(1) << gridLayoutL2.y) == gridLayout.y &&
               (size_t(1) << gridLayoutL2.z) == gridLayout.z);

        {
            auto * offsets = (uint32_t*)tmp;
            auto * codes = tmp + gridLayout.x*gridLayout.y*gridLayout.z*(sizeof(uint32_t)*T_n);

            const auto one_epi32 = _mm256_set1_epi32(1);
            const auto three_epi32 = _mm256_set1_epi32(3);
            const auto gridL2_3_epi64 = _mm_set1_epi64x(gridLayoutL2.x + gridLayoutL2.y + gridLayoutL2.z);
            const auto X_epi32 = _mm256_set1_epi32(static_cast<int>(gridLayout.x));
            const auto XY_epi32 = _mm256_set1_epi32(static_cast<int>(gridLayout.x*gridLayout.y));
            const auto mask_255_epi32 = _mm256_set1_epi32(255);
            for (size_t iix = 0; iix < indexCount; iix += 8) {
                const auto w = _mm256_load_si256((const __m256i*)(idxwork + iix));
                const auto o0 = _mm256_srli_epi32(w, 8);

                // (((w >> 5) & 1), (((w >> 6) & 1) ? gridLayout.x : 0), (((w >> 7) & 1) ? gridLayout.x*gridLayout.y : 0))
                const auto shiftI = _mm256_and_si256(_mm256_srli_epi32(w, 5), one_epi32);
                const auto shiftJ = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(_mm256_srli_epi32(w, 6), one_epi32), one_epi32), X_epi32);
                const auto shiftK = _mm256_and_si256(_mm256_cmpeq_epi32(_mm256_and_si256(_mm256_srli_epi32(w, 7), one_epi32), one_epi32), XY_epi32);

                const auto shiftedCell = _mm256_add_epi32(_mm256_add_epi32(o0, shiftI),
                                                          _mm256_add_epi32(shiftJ, shiftK));

                const auto L = _mm256_sll_epi32(_mm256_and_si256(w, three_epi32), gridL2_3_epi64); // l << (gridLayoutL2.x + gridLayoutL2.y + gridLayoutL2.z)
                const auto I = _mm256_add_epi32(shiftedCell, L);

                const auto shiftedCode = _mm256_and_si256(_mm256_i32gather_epi32((const int*)codes, I, 1), mask_255_epi32);
                const auto axisShift = _mm256_srli_epi32(w, 2);
                const auto axes = _mm256_and_si256(_mm256_i32gather_epi32((const int*)axesTable, shiftedCode, 1), axisShift);

                const auto addJ = _mm256_and_si256(_mm256_srli_epi32(axes, 1), one_epi32);
                const auto addK = _mm256_and_si256(_mm256_srli_epi32(axes, 2), one_epi32);
                const auto offset = _mm256_add_epi32(_mm256_i32gather_epi32((const int*)offsets, I, 4),
                                                     _mm256_add_epi32(addJ, addK));

                _mm256_store_si256((__m256i*)(indices + iix), offset);
            }

        }
        return indexCount;
    }
    //#pragma optimize( "", on )


}

namespace Cogs::Core::EchoSounder {


    //#pragma optimize( "", off )
    MeshHandle createIsoSurfacesAVX2(Context* context,
                                     uint64_t& analyze,
                                     uint64_t& vtx,
                                     uint64_t& idx,
                                     MemoryBuffer& scratch,
                                     const float* values,
                                     const glm::uvec3 gridLayout,
                                     const glm::vec3 /*minCorner*/,
                                     const glm::vec3 /*maxCorner*/,
                                     const float* thresholds, size_t count)
    {
        assert(gridLayout.x != 0 && gridLayout.y != 0 && gridLayout.z != 0);

        const uint32_t T_n = (uint32_t)count;

        // gridLayout is the lattice of sample positions; a cell must be bounded by
        // samples on all sides, so the cell layout is one less along each dimension.
        //const auto indexCountTable = MarchingCubes::indexCountTable();

        //const auto * vertexCountTable = MarchingCubes::vertexCountTable().data();


        const auto cellLayout = gridLayout - glm::uvec3(1, 1, 1);

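        // Scratch sizing per sample and per threshold: 4 bytes of vertex offsets,
        // 1 byte of cell codes, 3 uint32 of vertex work (a cell owns at most three
        // edge vertices) and 15 uint32 of index work (at most five triangles per
        // cell). The offsets match the pointer arithmetic in analyzeAVX2 and the
        // extraction helpers above.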
        scratch.resize(T_n*(sizeof(uint32_t) + sizeof(uint8_t) + 3 * sizeof(uint32_t) + 15*sizeof(uint32_t))*gridLayout.x*gridLayout.y*gridLayout.z, false);
        auto * tmp = (uint8_t*)scratch.data();

        unsigned occupiedCells = 0;
        unsigned indexCount = 0;
        unsigned vertexCount = 0;

        auto timer = Timer::startNew();
        analyzeAVX2(tmp, vertexCount, occupiedCells, indexCount, values, gridLayout, T_n, thresholds);
        analyze = timer.elapsedMicroseconds();

        if (occupiedCells == 0) {
            vtx = 0;
            idx = 0;
            return MeshHandle::NoHandle;
        }

        auto mesh = context->meshManager->createLocked();
        //mesh->setBounds(Cogs::Geometry::BoundingBox{ glm::vec3(0,0,0), glm::vec3(gridLayout.x, gridLayout.y, gridLayout.z) });
        mesh->setMeshFlag(MeshFlags::ClockwiseWinding);
        mesh->primitiveType = PrimitiveType::TriangleList;

        auto vertices = mesh->map<Vertex>(VertexDataType::Interleaved0, VertexFormats::Pos3fNorm3fTex2f, vertexCount);

        // Extract vertices
        timer = Timer::startNew();
        auto vix = vertexExtractAVX2(vertices, tmp, values, gridLayout, T_n, thresholds, vertexCount);
        assert(vix == vertexCount);
        vtx = timer.elapsedMicroseconds();

        // Extract indices
        std::vector<uint32_t> sub_mesh(T_n + 1, 0);
        mesh->clearIndexes();
        Idx* indices = (Idx*)mesh->mapStream(VertexDataType::Indexes, 0, indexCount, sizeof(Idx), true);
        timer = Timer::startNew();
        auto iix = indexExtract(sub_mesh.data(), indices, tmp, gridLayout, T_n, indexCount, vertexCount);
        assert(iix == indexCount);
        mesh->setMeshFlag(MeshFlags::Indexed);
        mesh->setMeshFlag(MeshFlags::IndexesChanged);
        idx = timer.elapsedMicroseconds();

        auto subMeshes = mesh->mapSubMeshes(T_n);
        for (unsigned l = 0; l < T_n; l++) {
            uint32_t start = sub_mesh[l];
            uint32_t size = sub_mesh[l + 1] - sub_mesh[l];
            subMeshes[l] = { start, size, PrimitiveType::TriangleList };
        }
        mesh->setCount(indexCount);

        auto box = Geometry::makeEmptyBoundingBox<Geometry::BoundingBox>();
        for (unsigned i = 0; i < vertexCount; i++) {
            glm::vec3 pos = vertices[i].pos;
            box.min = glm::min(box.min, pos);
            box.max = glm::max(box.max, pos);
        }
        mesh->setBounds(box);

        _mm256_zeroupper();
        return mesh.getHandle();
    }
    //#pragma optimize( "", on )

}
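
// A hedged usage sketch (not part of this translation unit); the scratch-buffer
// type and the NoHandle comparison are assumptions based on the includes and the
// early-out above:
//
//     MemoryBuffer scratch;
//     uint64_t tAnalyze = 0, tVtx = 0, tIdx = 0;
//     const float thresholds[] = { 0.25f, 0.5f, 0.75f };   // at most four thresholds
//     auto handle = Cogs::Core::EchoSounder::createIsoSurfacesAVX2(
//         context, tAnalyze, tVtx, tIdx, scratch,
//         volumeData, gridLayout, minCorner, maxCorner,
//         thresholds, 3);
//     if (handle != Cogs::Core::MeshHandle::NoHandle) {
//         // attach the mesh to an entity's MeshComponent, etc.
//     }
//
// gridLayout must have power-of-two extents that are multiples of 8, with x <= 256
// and x*y a multiple of 32, and the value array must be 32-byte aligned, per the
// asserts and aligned loads in analyzeAVX2.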