#if !defined(EMSCRIPTEN) && !defined(__APPLE__)

#include "Services/Features.h"
#include "IsoSurfaces_internal.h"
#include "Platform/Instrumentation.h"
#include "Foundation/Platform/Timer.h"

#if !defined(__AVX2__)
static_assert(false, "This compile unit must be compiled with AVX2");
#endif

#include <x86intrin.h>

#include <algorithm>
#include <cassert>
#include <cstdint>

using glm::uvec3;
using std::max;
using std::min;

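// Clips one axis of a tile against the field extent: given the signed offset of the tile
// into the field (in_shift), the scratch run length (out_runlength) and the field run
// length (in_runlength), splits the output run into pad_start (before the field),
// process (samples read from the field, starting in_skip samples in) and pad_stop.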
void determineRunlengths(uint32_t& pad_start,
                         uint32_t& in_skip,
                         uint32_t& process,
                         uint32_t& pad_stop,
                         const int32_t in_shift,
                         const uint32_t out_runlength,
                         const uint32_t in_runlength)
{
    if (in_shift < 0) {
        pad_start = min(out_runlength, static_cast<uint32_t>(-in_shift));
        in_skip = 0;
        process = static_cast<uint32_t>(min(out_runlength - pad_start, in_runlength));
    }
    else {
        pad_start = 0;
        in_skip = static_cast<uint32_t>(in_shift);
        process = static_cast<uint32_t>(min(max(0u, in_runlength - in_shift), out_runlength));
    }
    assert(pad_start <= out_runlength);
    assert(process <= out_runlength);

    pad_stop = out_runlength - pad_start - process;
}

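// Fills `count` uint32 slots with the padding pattern `out`, using 16-byte stores where
// possible and 8-byte/4-byte stores for the tail; returns the advanced destination pointer.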
inline uint32_t* writePadding(uint32_t* dst, uint32_t count, const __m128i out)
{
    uint32_t i = 0;
    while (i + 4 <= count) {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), out); i += 4;
    }
    if (i + 2 <= count) {
        _mm_storel_epi64(reinterpret_cast<__m128i*>(dst + i), out); i += 2;
    }
    if (i < count) {
        dst[i] = out.m128i_u32[0]; i++;
    }
    return dst + i;
}

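// fetch2/fetch4 load two or four consecutive field samples and widen them to packed
// floats; specializations are provided for float and uint16_t fields.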
template<typename Type>
inline __m128 fetch2(const Type* mem);

template<>
inline __m128 fetch2<float>(const float* mem)
{
    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(mem)));
}

template<>
inline __m128 fetch2<uint16_t>(const uint16_t* mem)
{
    __m128i t = _mm_setzero_si128();
    t.m128i_u32[0] = *reinterpret_cast<const uint32_t*>(mem);
    t = _mm_unpacklo_epi16(t, _mm_setzero_si128());
    return _mm_cvtepi32_ps(t);
}

template<typename Type>
inline __m128 fetch4(const Type* mem);

template<>
inline __m128 fetch4<float>(const float* mem)
{
    return _mm_loadu_ps(mem);
}

template<>
inline __m128 fetch4<uint16_t>(const uint16_t* mem)
{
    __m128i t = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mem));
    t = _mm_unpacklo_epi16(t, _mm_setzero_si128());
    return _mm_cvtepi32_ps(t);
}

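// Classifies every sample of the tile (plus its one-sample apron) against eight thresholds
// at once and writes one byte per (sample, threshold) pair into the scratch buffer:
// byte = 1 when the sample is below the threshold, 0 otherwise. Samples outside the field
// are padded with the exterior classification. Layout: 8 bytes (two uint32s) per sample,
// one byte per threshold lane.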
template<typename Type>
void setInitialBit8(uint64_t* s,
                    const uvec3& fieldDim,
                    const uvec3& tileSize,
                    const uvec3& rA,
                    const glm::ivec3& gridA,
                    const float* T,
                    const Type* field,
                    const bool exteriorIsLess)
{
    const uvec3 scratchSize = tileSize + uvec3(1);

    uint32_t x_pad_start, x_in_skip, x_process, x_pad_stop;
    determineRunlengths(x_pad_start, x_in_skip, x_process, x_pad_stop,
                        rA.x + gridA.x, scratchSize.x, fieldDim.x);

    uint32_t y_pad_start, y_in_skip, y_process, y_pad_stop;
    determineRunlengths(y_pad_start, y_in_skip, y_process, y_pad_stop,
                        rA.y + gridA.y, scratchSize.y, fieldDim.y);

    uint32_t z_pad_start, z_in_skip, z_process, z_pad_stop;
    determineRunlengths(z_pad_start, z_in_skip, z_process, z_pad_stop,
                        rA.z + gridA.z, scratchSize.z, fieldDim.z);

    uint32_t* dst = reinterpret_cast<uint32_t*>(s);

    const __m128i ones = _mm_set1_epi8(1);
    const __m128i ext = _mm_set1_epi32(exteriorIsLess ? ~0 : 0);
    const __m128i out = _mm_and_si128(ext, ones);

    const __m128 t0 = _mm_set_ps(T[3], T[2], T[1], T[0]);
    const __m128 t1 = _mm_set_ps(T[7], T[6], T[5], T[4]);

    dst = writePadding(dst, 2 * scratchSize.y * scratchSize.x * z_pad_start, out);
    for (uint32_t k = 0; k < z_process; k++) {
        dst = writePadding(dst, 2 * scratchSize.x * y_pad_start, out);
        for (uint32_t j = 0; j < y_process; j++) {
            dst = writePadding(dst, 2 * x_pad_start, out);

            const auto* src = field + static_cast<int32_t>(((k + z_in_skip) * fieldDim.y + j + y_in_skip) * fieldDim.x) + x_in_skip;
            for (uint32_t i = 0; i < (x_process >> 1); i++) {
                __m128 v = fetch2(src); src += 2;

                __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
                __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v0, t0));
                __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(v0, t1));
                __m128i mm0 = _mm_packs_epi32(m0, m1);

                __m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
                __m128i m2 = _mm_castps_si128(_mm_cmplt_ps(v1, t0));
                __m128i m3 = _mm_castps_si128(_mm_cmplt_ps(v1, t1));
                __m128i mm1 = _mm_packs_epi32(m2, m3);

                __m128i m = _mm_packs_epi16(mm0, mm1);
                __m128i b = _mm_and_si128(m, ones);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), b);
                dst += 4;
            }
            if (x_process & 1) {
                __m128 v = _mm_set1_ps(*src++);
                __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v, t0));
                __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(v, t1));
                __m128i mm0 = _mm_packs_epi32(m0, m1);

                __m128i m = _mm_packs_epi16(mm0, _mm_setzero_si128());
                __m128i b = _mm_and_si128(m, ones);

                _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), b);
                dst += 2;
            }
            dst = writePadding(dst, 2 * x_pad_stop, out);
        }
        dst = writePadding(dst, 2 * scratchSize.x * y_pad_stop, out);
    }
    dst = writePadding(dst, 2 * scratchSize.y * scratchSize.x * z_pad_stop, out);
}

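// Four-threshold variant of setInitialBit8: 4 bytes (one uint32) per sample.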
template<typename Type>
void setInitialBit4(uint32_t* s,
                    const uvec3& fieldDim,
                    const uvec3& tileSize,
                    const uvec3& rA,
                    const glm::ivec3& gridA,
                    const float* T,
                    const Type* field,
                    const bool exteriorIsLess)
{
    const uvec3 scratchSize = tileSize + uvec3(1);

    uint32_t x_pad_start, x_in_skip, x_process, x_pad_stop;
    determineRunlengths(x_pad_start, x_in_skip, x_process, x_pad_stop,
                        rA.x + gridA.x, scratchSize.x, fieldDim.x);

    uint32_t y_pad_start, y_in_skip, y_process, y_pad_stop;
    determineRunlengths(y_pad_start, y_in_skip, y_process, y_pad_stop,
                        rA.y + gridA.y, scratchSize.y, fieldDim.y);

    uint32_t z_pad_start, z_in_skip, z_process, z_pad_stop;
    determineRunlengths(z_pad_start, z_in_skip, z_process, z_pad_stop,
                        rA.z + gridA.z, scratchSize.z, fieldDim.z);

    uint32_t* dst = s;

    const __m128i ones = _mm_set1_epi8(1);
    const __m128i ext = _mm_set1_epi32(exteriorIsLess ? ~0 : 0);
    const __m128i out = _mm_and_si128(ext, ones);

    const __m128 t0 = _mm_set_ps(T[3], T[2], T[1], T[0]);

    dst = writePadding(dst, scratchSize.y * scratchSize.x * z_pad_start, out);
    for (uint32_t k = 0; k < z_process; k++) {
        dst = writePadding(dst, scratchSize.x * y_pad_start, out);
        for (uint32_t j = 0; j < y_process; j++) {
            dst = writePadding(dst, x_pad_start, out);

            const auto* src = field + static_cast<int32_t>(((k + z_in_skip) * fieldDim.y + j + y_in_skip) * fieldDim.x) + x_in_skip;
            for (uint32_t i = 0; i < (x_process >> 2); i++) {
                __m128 v = fetch4(src); src += 4;

                __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), t0));
                __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), t0));
                __m128i mm0 = _mm_packs_epi32(m0, m1);

                __m128i m2 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), t0));
                __m128i m3 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), t0));
                __m128i mm1 = _mm_packs_epi32(m2, m3);

                __m128i m = _mm_packs_epi16(mm0, mm1);
                __m128i b = _mm_and_si128(m, ones);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), b);
                dst += 4;
            }
            for (uint32_t i = 0; i < (x_process & 3); i++) {
                __m128 v = _mm_set1_ps(*src++);
                __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v, t0));
                __m128i mm0 = _mm_packs_epi32(m0, _mm_setzero_si128());
                __m128i m = _mm_packs_epi16(mm0, _mm_setzero_si128());
                __m128i b = _mm_and_si128(m, ones);
                *dst++ = b.m128i_u32[0];
            }
            dst = writePadding(dst, x_pad_stop, out);
        }
        dst = writePadding(dst, scratchSize.x * y_pad_stop, out);
    }
    dst = writePadding(dst, scratchSize.y * scratchSize.x * z_pad_stop, out);
}

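// xMerge/yMerge/zMerge fold the per-corner classification bytes into per-cell case codes
// (one bit per cell corner). xMerge ORs each element with its +x neighbour shifted into
// bit 1, and at the same time shifts the data down by inOutShift uint32s so the merged
// codes start at the beginning of the scratch buffer.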
void xMerge(uint32_t* s,
            const uint32_t uint32sInElement,
            const uint32_t elements,
            const uint32_t inOutShift)
{
    const uint32_t* q = s + uint32sInElement;
    uint32_t i = 0;
    for (; i + 8 < uint32sInElement * elements; i += 8) {
        __m256i a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + inOutShift));
        __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(q + i + inOutShift));
        __m256i c = _mm256_or_si256(a, _mm256_slli_epi32(b, 1));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(s + i), c);
    }
    for (; i < uint32sInElement * elements; i++) {
        s[i] = s[i + inOutShift] | (q[i + inOutShift] << 1);
    }
}

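// ORs each row with its +y neighbour row, shifted into bits 2-3 of every case byte.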
void yMerge(uint32_t* s,
            const uint32_t uint32sInARow,
            const uint32_t rows)
{
    const uint32_t* q = s + uint32sInARow;
    uint32_t i = 0;
    for (; i + 8 < uint32sInARow * rows; i += 8) {
        __m256i a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));
        __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(q + i));
        __m256i c = _mm256_or_si256(a, _mm256_slli_epi32(b, 2));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(s + i), c);
    }
    for (; i < uint32sInARow * rows; i++) {
        s[i] = s[i] | (q[i] << 2);
    }
}

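// ORs each slice with its +z neighbour slice, shifted into bits 4-7, completing the case
// byte. Bytes that end up as 0xFF (all corners below the threshold) are cleared so that
// only cells actually intersected by the iso-surface stay non-zero.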
void zMerge(uint32_t* s,
            const uint32_t uint32sInASlice,
            const uint32_t slices)
{
    const uint32_t* q = s + uint32sInASlice;
    uint32_t i = 0;

    __m256i zero256 = _mm256_setzero_si256();
    __m256i ones256 = _mm256_cmpeq_epi32(zero256, zero256);
    for (; i + 8 < uint32sInASlice * slices; i += 8) {
        __m256i a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));
        __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(q + i));
        __m256i c = _mm256_or_si256(a, _mm256_slli_epi32(b, 4));
        __m256i m = _mm256_cmpeq_epi8(c, ones256);
        c = _mm256_andnot_si256(m, c);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(s + i), c);
    }
    for (; i < uint32sInASlice * slices; i++) {
        __m128i zero128 = _mm_setzero_si128();
        __m128i ones128 = _mm_cmpeq_epi32(zero128, zero128);
        __m128i a = _mm_setzero_si128(); a.m128i_u32[0] = s[i];
        __m128i b = _mm_setzero_si128(); b.m128i_u32[0] = q[i];
        __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 4));
        __m128i m = _mm_cmpeq_epi8(c, ones128);
        c = _mm_andnot_si128(m, c);
        s[i] = c.m128i_u32[0];
    }
}

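// countActiveCells* count, per threshold lane, how many cells in the clamped tile have a
// non-zero case code. The 16-lane variant keeps four SSE accumulators of four 32-bit
// counters each.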
void countActiveCells16(uint32_t* Nc,
                        const uint32_t* s,
                        const uvec3& tileSizeClamped,
                        const uvec3& tileSize)
{
    const uvec3 scratchSize = tileSize + uvec3(1);

    __m128i Nc_[4];
    Nc_[0] = _mm_setzero_si128();
    Nc_[1] = _mm_setzero_si128();
    Nc_[2] = _mm_setzero_si128();
    Nc_[3] = _mm_setzero_si128();

    __m128i ones4 = _mm_set1_epi32(1);
    __m128i zero = _mm_setzero_si128();
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto* src = s + 4 * (k * scratchSize.y + j) * scratchSize.x;
            for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + i);
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);
                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                Nc_[0] = _mm_add_epi32(Nc_[0], _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                Nc_[1] = _mm_add_epi32(Nc_[1], _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));
                Nc_[2] = _mm_add_epi32(Nc_[2], _mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4));
                Nc_[3] = _mm_add_epi32(Nc_[3], _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));
            }
        }
    }
    for (uint32_t l = 0; l < 16; l++) {
        Nc[l] = Nc_[l / 4].m128i_i32[l % 4];
    }
}

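// Eight-lane variant: each cell occupies two uint32s (8 case bytes), so the main loop
// counts two cells per 16-byte load and an 8-byte load handles an odd trailing cell.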
void countActiveCells8(uint32_t* Nc,
                       const uint32_t* s,
                       const uvec3& tileSizeClamped,
                       const uvec3& tileSize)
{
    const uvec3 scratchSize = tileSize + uvec3(1);

    __m128i Nc_[2];
    Nc_[0] = _mm_setzero_si128();
    Nc_[1] = _mm_setzero_si128();

    __m128i ones4 = _mm_set1_epi32(1);
    __m128i zero = _mm_setzero_si128();
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto* src = s + 2 * (k * scratchSize.y + j) * scratchSize.x;

            uint32_t i = 0;
            for (; i + 1 < tileSizeClamped.x; i += 2) {
                __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 2 * i));
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);

                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                __m128i sum03 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4),
                                              _mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4));
                Nc_[0] = _mm_add_epi32(Nc_[0], sum03);

                __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                __m128i sum47 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4),
                                              _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));
                Nc_[1] = _mm_add_epi32(Nc_[1], sum47);
            }
            for (; i < tileSizeClamped.x; i++) {
                __m128i code16_0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src + 2 * i));
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                Nc_[0] = _mm_add_epi32(Nc_[0], _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                Nc_[1] = _mm_add_epi32(Nc_[1], _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));
            }
        }
    }
    for (uint32_t l = 0; l < 8; l++) {
        Nc[l] = Nc_[l / 4].m128i_i32[l % 4];
    }
}

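// Four-lane variant: one uint32 (4 case bytes) per cell, four cells per 16-byte load.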
void countActiveCells4(uint32_t* Nc,
                       const uint32_t* s,
                       const uvec3& tileSizeClamped,
                       const uvec3& tileSize)
{
    const uvec3 scratchSize = tileSize + uvec3(1);
    __m128i Nc_ = _mm_setzero_si128();

    __m128i ones4 = _mm_set1_epi32(1);
    __m128i zero = _mm_setzero_si128();

    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto* src = s + (k * scratchSize.y + j) * scratchSize.x;

            uint32_t i = 0;
            for (; i + 4 < tileSizeClamped.x; i += 4) {
                __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                __m128i sum0 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4),
                                             _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));

                __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);
                __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                __m128i sum1 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4),
                                             _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));

                Nc_ = _mm_add_epi32(Nc_, _mm_add_epi32(sum0, sum1));
            }
            for (; i < tileSizeClamped.x; i++) {
                __m128i code16_0 = zero;
                code16_0.m128i_u32[0] = src[i];
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                Nc_ = _mm_add_epi32(Nc_, _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
            }
        }
    }
    for (uint32_t l = 0; l < 4; l++) {
        Nc[l] = Nc_.m128i_i32[l];
    }
}

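// calculateOffsets* compact the active cells into the global output arrays. For every
// non-zero case code the cell receives a compacted index c = Oc[lane] + running count,
// where Oc holds the per-lane ranges reserved earlier from the shared atomic counters;
// cellMap maps the uncompacted cell index to c, and activeCellCases / activeCellIndices
// record the case code and the uncompacted index at position c.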
void calculateOffsets16(int32_t* cellMap,
                        uint8_t* activeCellCases,
                        int32_t* activeCellIndices,
                        const __m128i* s,
                        const uvec3& tileSizeClamped,
                        const uvec3& tileSize,
                        const uvec3& rA,
                        const uvec3& M,
                        const uint32_t* Oc,
                        const size_t layerStride,
                        const uint32_t tOff,
                        const uint32_t lanes)
{
    uint32_t Ni[16];
    for (uint32_t l = 0; l < lanes; l++) {
        Ni[l] = 0;
    }

    const uvec3 scratchSize = tileSize + uvec3(1);
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto* src = s + (k * scratchSize.y + j) * scratchSize.x;
            const auto lineOffset = ((k + rA.z) * M.y + (j + rA.y)) * M.x;

            for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                __m128i codes = _mm_loadu_si128(src + i);

                for (uint32_t l = 0; l < lanes; l++) {
                    const auto code = codes.m128i_u8[l];
                    if (code == 0) continue;

                    const auto t = tOff + l;
                    const auto uncompactedCellIndex = lineOffset + i + rA.x;
                    const auto c = Oc[l] + (Ni[l]++);
                    cellMap[layerStride * t + uncompactedCellIndex] = c;
                    activeCellCases[layerStride * t + c] = code;
                    activeCellIndices[layerStride * t + c] = uncompactedCellIndex;
                }
            }
        }
    }
}

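// Eight-lane variant of calculateOffsets16: the case bytes of a cell are read as a single
// uint64_t element and examined byte by byte.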
void calculateOffsets8(int32_t* cellMap,
                       uint8_t* activeCellCases,
                       int32_t* activeCellIndices,
                       const uint64_t* s,
                       const uvec3& tileSizeClamped,
                       const uvec3& tileSize,
                       const uvec3& rA,
                       const uvec3& M,
                       const uint32_t* Oc,
                       const size_t layerStride,
                       const uint32_t tOff,
                       const uint32_t lanes)
{
    uint32_t Ni[8];
    for (uint32_t l = 0; l < lanes; l++) {
        Ni[l] = 0;
    }

    const uvec3 scratchSize = tileSize + uvec3(1);
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto* src = s + (k * scratchSize.y + j) * scratchSize.x;
            const auto lineOffset = ((k + rA.z) * M.y + (j + rA.y)) * M.x;

            for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                union { uint64_t u64; uint8_t u8[8]; } codes;
                codes.u64 = src[i];

                for (uint32_t l = 0; l < lanes; l++) {
                    const auto code = codes.u8[l];
                    if (code == 0) continue;

                    const auto t = tOff + l;
                    const auto uncompactedCellIndex = lineOffset + i + rA.x;
                    const auto c = Oc[l] + (Ni[l]++);
                    cellMap[layerStride * t + uncompactedCellIndex] = c;
                    activeCellCases[layerStride * t + c] = code;
                    activeCellIndices[layerStride * t + c] = uncompactedCellIndex;
                }
            }
        }
    }
}

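// Four-lane variant: one uint32_t element (4 case bytes) per cell.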
void calculateOffsets4(int32_t* cellMap,
                       uint8_t* activeCellCases,
                       int32_t* activeCellIndices,
                       const uint32_t* s,
                       const uvec3& tileSizeClamped,
                       const uvec3& tileSize,
                       const uvec3& rA,
                       const uvec3& M,
                       const uint32_t* Oc,
                       const size_t layerStride,
                       const uint32_t tOff,
                       const uint32_t lanes)
{
    uint32_t Ni[4];
    for (uint32_t l = 0; l < lanes; l++) {
        Ni[l] = 0;
    }

    const uvec3 scratchSize = tileSize + uvec3(1);
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto* src = s + (k * scratchSize.y + j) * scratchSize.x;
            const auto lineOffset = ((k + rA.z) * M.y + (j + rA.y)) * M.x;

            for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                union { uint32_t u32; uint8_t u8[4]; } codes;
                codes.u32 = src[i];

                for (uint32_t l = 0; l < lanes; l++) {
                    const auto code = codes.u8[l];
                    if (code == 0) continue;

                    const auto t = tOff + l;
                    const auto uncompactedCellIndex = lineOffset + i + rA.x;
                    const auto c = Oc[l] + (Ni[l]++);
                    cellMap[layerStride * t + uncompactedCellIndex] = c;
                    activeCellCases[layerStride * t + c] = code;
                    activeCellIndices[layerStride * t + c] = uncompactedCellIndex;
                }
            }
        }
    }
}

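// Runs the x/y/z merge passes over a whole tile. uint32XShift compensates for the
// classification having been written uint32XShift uint32s into the scratch buffer;
// uint32sPerElement is 2 for the 8-threshold path and 1 for the 4-threshold path.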
void merge(uint32_t* s,
           const uint32_t uint32XShift,
           const uint32_t uint32sPerElement,
           const uint32_t tileX,
           const uint32_t tileY,
           const uint32_t tileZ)
{
    xMerge(s, uint32sPerElement, (tileX + 1) * (tileY + 1) * (tileZ + 1) - 1, uint32XShift);
    yMerge(s, uint32sPerElement * (tileX + 1), (tileY + 1) * (tileZ + 1) - 1);
    zMerge(s, uint32sPerElement * (tileX + 1) * (tileY + 1), tileZ);
}

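// Analyze pass for one tile of a float field: classify samples against up to eight
// thresholds at a time, merge the corner bits into per-cell case codes, count the active
// cells per threshold, reserve output ranges through the shared atomic counters, and emit
// the compacted cell lists. Thresholds are handled in groups of eight with a four-wide
// tail loop for the remainder.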
void Cogs::Core::IsoSurfaces::analyzeTile_f32_AVX2(AnalyzeGlobalState* g, const glm::ivec3 id)
{
    CpuInstrumentationScope(SCOPE_ISOSURFACES, "analyzeTile_f32_AVX2");
    auto timer = Timer::startNew();

    const float* field = (const float*)g->field;
    const float* thresholds = (const float*)g->thresholds;

    const uvec3 fieldDim = uvec3(g->fieldDim);
    const uvec3 tileSize = uvec3(g->tileSize);
    const uvec3 scratchSize = tileSize + uvec3(1);
    const uvec3 M = uvec3(g->M);
    const auto exteriorIsLess = g->exteriorIsLess;
    uvec3 rA = tileSize * uvec3(id);
    uvec3 rB = glm::min(M, rA + tileSize);
    const auto tileSizeClamped = glm::min(tileSize, rB - rA);
    const size_t layerStride = g->M.x * g->M.y * g->M.z;
    const auto Nt = static_cast<uint32_t>(g->Nt);

    auto* scratch = g->scratchAcquire(4 * sizeof(int) * (scratchSize.x * scratchSize.y * scratchSize.z + 1 + 4));
    auto* s = reinterpret_cast<uint32_t*>(scratch->data());

    float T[8];
    uint32_t Nc[8];
    uint32_t Oc[8];

    uint32_t tOff = 0;
    for (; tOff + 4 < Nt; tOff += 8) {
        auto lanes = min(8u, Nt - tOff);
        for (uint32_t i = 0; i < 8; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit8(reinterpret_cast<uint64_t*>(s) + 8, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 2, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells8(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets8(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          reinterpret_cast<uint64_t*>(s),
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    for (; tOff < Nt; tOff += 4) {
        auto lanes = min(4u, Nt - tOff);
        for (uint32_t i = 0; i < 4; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit4(s + 16, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(reinterpret_cast<uint32_t*>(s), 16, 1, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells4(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets4(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          s,
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    g->scratchRelease(scratch);

    if (g->elapsed_us != nullptr) {
        g->elapsed_us->fetch_add(timer.elapsedMicroseconds());
    }
}

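// Same analyze pass for uint16_t fields; samples are widened to float in fetch2/fetch4
// before the threshold comparisons.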
void Cogs::Core::IsoSurfaces::analyzeTile_u16_AVX2(AnalyzeGlobalState* g, const glm::ivec3 id)
{
    CpuInstrumentationScope(SCOPE_ISOSURFACES, "analyzeTile_u16_AVX2");
    auto timer = Timer::startNew();

    const uint16_t* field = (const uint16_t*)g->field;
    const uint16_t* thresholds = (const uint16_t*)g->thresholds;

    const uvec3 fieldDim = uvec3(g->fieldDim);
    const uvec3 tileSize = uvec3(g->tileSize);
    const uvec3 scratchSize = tileSize + uvec3(1);
    const uvec3 M = uvec3(g->M);
    const auto exteriorIsLess = g->exteriorIsLess;
    uvec3 rA = tileSize * uvec3(id);
    uvec3 rB = glm::min(M, rA + tileSize);
    const auto tileSizeClamped = glm::min(tileSize, rB - rA);
    const size_t layerStride = g->M.x * g->M.y * g->M.z;
    const auto Nt = static_cast<uint32_t>(g->Nt);

    auto* scratch = g->scratchAcquire(4 * sizeof(int) * (scratchSize.x * scratchSize.y * scratchSize.z + 1 + 4));
    auto* s = reinterpret_cast<uint32_t*>(scratch->data());

    float T[8];
    uint32_t Nc[8];
    uint32_t Oc[8];

    uint32_t tOff = 0;
    for (; tOff + 4 < Nt; tOff += 8) {
        auto lanes = min(8u, Nt - tOff);
        for (uint32_t i = 0; i < 8; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit8(reinterpret_cast<uint64_t*>(s) + 8, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 2, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells8(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets8(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          reinterpret_cast<uint64_t*>(s),
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    for (; tOff < Nt; tOff += 4) {
        auto lanes = min(4u, Nt - tOff);
        for (uint32_t i = 0; i < 4; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit4(s + 16, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(reinterpret_cast<uint32_t*>(s), 16, 1, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells4(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets4(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          s,
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    g->scratchRelease(scratch);

    if (g->elapsed_us != nullptr) {
        g->elapsed_us->fetch_add(timer.elapsedMicroseconds());
    }
}

#endif  // !defined(EMSCRIPTEN) && !defined(__APPLE__)