#if !defined(EMSCRIPTEN) && !defined(__APPLE__)

#include "Services/Features.h"
#include "IsoSurfaces_internal.h"
#include "Platform/Instrumentation.h"
#include "Foundation/Platform/Timer.h"

#include <x86intrin.h>
#include <cassert>

// uvec3 and the unqualified min/max used below are assumed to come from
// IsoSurfaces_internal.h.
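// SSE implementation of the per-tile analysis pass for iso-surface extraction.
// For each tile, the field values on the tile's corner lattice are classified
// against up to eight thresholds at once (setInitialBit4/setInitialBit8), the
// per-corner bits are OR-merged into an 8-bit cell case code per threshold
// lane (xMerge/yMerge/zMerge, driven by merge), the number of active cells per
// lane is counted (countActiveCells*), and the active cells are finally
// compacted into the per-threshold output arrays (calculateOffsets*).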
void determineRunlengths(uint32_t& pad_start,
                         uint32_t& in_skip,
                         uint32_t& process,
                         uint32_t& pad_stop,
                         const int32_t in_shift,
                         const uint32_t out_runlength,
                         const uint32_t in_runlength)
{
    if (in_shift < 0) {
        // Output window starts before the input: pad until the input begins.
        pad_start = min(out_runlength, static_cast<uint32_t>(-in_shift));
        in_skip = 0;
        process = static_cast<uint32_t>(min(out_runlength - pad_start, in_runlength));
    }
    else {
        // Output window starts inside the input: skip the leading input values.
        pad_start = 0;
        in_skip = static_cast<uint32_t>(in_shift);
        process = static_cast<uint32_t>(min(max(0u, in_runlength - in_shift), out_runlength));
    }
    assert(pad_start <= out_runlength);
    assert(process <= out_runlength);

    pad_stop = out_runlength - pad_start - process;
}
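// Writes `count` uint32 words of the padding pattern `out` and returns a
// pointer just past the padded range. Used to fill the parts of the scratch
// volume that fall outside the field with the exterior classification.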
inline uint32_t* writePadding(uint32_t* dst, uint32_t count, const __m128i out)
{
    uint32_t i = 0;
    while (i + 4 <= count) {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), out); i += 4;
    }
    if (i + 2 <= count) {
        _mm_storel_epi64(reinterpret_cast<__m128i*>(dst + i), out); i += 2;
    }
    if (i < count) {
        // Extract the low 32 bits portably instead of poking the MSVC-only
        // m128i_u32 member.
        dst[i] = static_cast<uint32_t>(_mm_cvtsi128_si32(out)); i++;
    }
    return dst + count;
}
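// fetch2/fetch4 load two or four consecutive field samples and widen them to
// float lanes, so the same comparison code works for float and uint16 fields.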
template<typename Type>
inline __m128 fetch2(const Type * mem);

template<>
inline __m128 fetch2<float>(const float * mem)
{
    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(mem)));
}

template<>
inline __m128 fetch2<uint16_t>(const uint16_t * mem)
{
    __m128i t = _mm_cvtsi32_si128(static_cast<int>(*reinterpret_cast<const uint32_t*>(mem)));
    t = _mm_unpacklo_epi16(t, _mm_setzero_si128());
    return _mm_cvtepi32_ps(t);
}

template<typename Type>
inline __m128 fetch4(const Type * mem);

template<>
inline __m128 fetch4<float>(const float * mem)
{
    return _mm_loadu_ps(mem);
}

template<>
inline __m128 fetch4<uint16_t>(const uint16_t * mem)
{
    __m128i t = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mem));
    t = _mm_unpacklo_epi16(t, _mm_setzero_si128());
    return _mm_cvtepi32_ps(t);
}
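// Classifies every corner of the (tileSize + 1)^3 scratch lattice against
// eight thresholds. Each corner is stored as eight bytes (one per threshold
// lane), set to 1 where the field value is below the threshold; corners that
// fall outside the field are padded with the exterior classification.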
template<typename Type>
void setInitialBit8(uint64_t* s,
                    const uvec3& fieldDim,
                    const uvec3& tileSize,
                    const uvec3& rA,
                    const glm::ivec3& gridA,
                    const float* T,
                    const Type* field,
                    const bool exteriorIsLess)
{
    const uvec3 scratchSize = tileSize + uvec3(1);

    uint32_t x_pad_start, x_in_skip, x_process, x_pad_stop;
    determineRunlengths(x_pad_start, x_in_skip, x_process, x_pad_stop,
                        rA.x + gridA.x, scratchSize.x, fieldDim.x);

    uint32_t y_pad_start, y_in_skip, y_process, y_pad_stop;
    determineRunlengths(y_pad_start, y_in_skip, y_process, y_pad_stop,
                        rA.y + gridA.y, scratchSize.y, fieldDim.y);

    uint32_t z_pad_start, z_in_skip, z_process, z_pad_stop;
    determineRunlengths(z_pad_start, z_in_skip, z_process, z_pad_stop,
                        rA.z + gridA.z, scratchSize.z, fieldDim.z);

    uint32_t * dst = reinterpret_cast<uint32_t*>(s);

    const __m128i ones = _mm_set1_epi8(1);
    const __m128i ext = _mm_set1_epi32(exteriorIsLess ? ~0 : 0);
    const __m128i out = _mm_and_si128(ext, ones);

    const __m128 t0 = _mm_set_ps(T[3], T[2], T[1], T[0]);
    const __m128 t1 = _mm_set_ps(T[7], T[6], T[5], T[4]);

    dst = writePadding(dst, 2 * scratchSize.y * scratchSize.x * z_pad_start, out);
    for (uint32_t k = 0; k < z_process; k++) {
        dst = writePadding(dst, 2 * scratchSize.x * y_pad_start, out);
        for (uint32_t j = 0; j < y_process; j++) {
            dst = writePadding(dst, 2 * x_pad_start, out);

            const auto * src = field + static_cast<int32_t>(((k + z_in_skip)*fieldDim.y + j + y_in_skip)*fieldDim.x) + x_in_skip;
            for (uint32_t i = 0; i < (x_process >> 1); i++) {
                // Two field samples per iteration, eight threshold lanes each.
                __m128 v = fetch2(src); src += 2;

                __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
                __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v0, t0));
                __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(v0, t1));
                __m128i mm0 = _mm_packs_epi32(m0, m1);

                __m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
                __m128i m2 = _mm_castps_si128(_mm_cmplt_ps(v1, t0));
                __m128i m3 = _mm_castps_si128(_mm_cmplt_ps(v1, t1));
                __m128i mm1 = _mm_packs_epi32(m2, m3);

                __m128i m = _mm_packs_epi16(mm0, mm1);
                __m128i b = _mm_and_si128(m, ones);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), b);
                dst += 4;
            }
            if (x_process & 1) {
                // Odd trailing sample.
                __m128 v = _mm_set1_ps(*src++);
                __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v, t0));
                __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(v, t1));
                __m128i mm0 = _mm_packs_epi32(m0, m1);

                __m128i m = _mm_packs_epi16(mm0, _mm_setzero_si128());
                __m128i b = _mm_and_si128(m, ones);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), b);
                dst += 2;
            }
            dst = writePadding(dst, 2 * x_pad_stop, out);
        }
        dst = writePadding(dst, 2 * scratchSize.x * y_pad_stop, out);
    }
    dst = writePadding(dst, 2 * scratchSize.y * scratchSize.x * z_pad_stop, out);
}
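// Four-threshold variant of setInitialBit8: each corner is stored as four
// bytes (one per threshold lane) in the scratch volume.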
template<typename Type>
void setInitialBit4(uint32_t* s,
                    const uvec3& fieldDim,
                    const uvec3& tileSize,
                    const uvec3& rA,
                    const glm::ivec3& gridA,
                    const float* T,
                    const Type* field,
                    const bool exteriorIsLess)
{
    const uvec3 scratchSize = tileSize + uvec3(1);

    uint32_t x_pad_start, x_in_skip, x_process, x_pad_stop;
    determineRunlengths(x_pad_start, x_in_skip, x_process, x_pad_stop,
                        rA.x + gridA.x, scratchSize.x, fieldDim.x);

    uint32_t y_pad_start, y_in_skip, y_process, y_pad_stop;
    determineRunlengths(y_pad_start, y_in_skip, y_process, y_pad_stop,
                        rA.y + gridA.y, scratchSize.y, fieldDim.y);

    uint32_t z_pad_start, z_in_skip, z_process, z_pad_stop;
    determineRunlengths(z_pad_start, z_in_skip, z_process, z_pad_stop,
                        rA.z + gridA.z, scratchSize.z, fieldDim.z);

    uint32_t * dst = s;

    const __m128i ones = _mm_set1_epi8(1);
    const __m128i ext = _mm_set1_epi32(exteriorIsLess ? ~0 : 0);
    const __m128i out = _mm_and_si128(ext, ones);

    const __m128 t0 = _mm_set_ps(T[3], T[2], T[1], T[0]);

    dst = writePadding(dst, scratchSize.y * scratchSize.x * z_pad_start, out);
    for (uint32_t k = 0; k < z_process; k++) {
        dst = writePadding(dst, scratchSize.x * y_pad_start, out);
        for (uint32_t j = 0; j < y_process; j++) {
            dst = writePadding(dst, x_pad_start, out);

            const auto * src = field + static_cast<int32_t>(((k + z_in_skip)*fieldDim.y + j + y_in_skip)*fieldDim.x) + x_in_skip;
            for (uint32_t i = 0; i < (x_process >> 2); i++) {
                // Four field samples per iteration, four threshold lanes each.
                __m128 v = fetch4(src); src += 4;

                __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), t0));
                __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), t0));
                __m128i mm0 = _mm_packs_epi32(m0, m1);

                __m128i m2 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), t0));
                __m128i m3 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), t0));
                __m128i mm1 = _mm_packs_epi32(m2, m3);

                __m128i m = _mm_packs_epi16(mm0, mm1);
                __m128i b = _mm_and_si128(m, ones);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), b);
                dst += 4;
            }
            for (uint32_t i = 0; i < (x_process & 3); i++) {
                __m128 v = _mm_set1_ps(*src++);
                __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v, t0));
                __m128i mm0 = _mm_packs_epi32(m0, _mm_setzero_si128());
                __m128i m = _mm_packs_epi16(mm0, _mm_setzero_si128());
                __m128i b = _mm_and_si128(m, ones);
                *dst++ = static_cast<uint32_t>(_mm_cvtsi128_si32(b));
            }
            dst = writePadding(dst, x_pad_stop, out);
        }
        dst = writePadding(dst, scratchSize.x * y_pad_stop, out);
    }
    dst = writePadding(dst, scratchSize.y * scratchSize.x * z_pad_stop, out);
}
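// xMerge/yMerge/zMerge OR each corner's classification bytes with those of
// its +x, +y and +z neighbours (shifted left by 1, 2 and 4 bits), turning the
// per-corner bits into an 8-bit cell case code per threshold lane. zMerge also
// clears codes where all eight corner bits are set, since such cells lie
// entirely on one side of the threshold and produce no geometry.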
void xMerge(uint32_t* s,
            const uint32_t uint32sInElement,
            const uint32_t elements,
            const uint32_t inOutShift)
{
    const uint32_t* q = s + uint32sInElement;
    uint32_t i = 0;
    for (; i + 4 < uint32sInElement*elements; i += 4) {
        __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + inOutShift));
        __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(q + i + inOutShift));
        __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 1));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(s + i), c);
    }
    for (; i < uint32sInElement*elements; i++) {
        s[i] = s[i + inOutShift] | (q[i + inOutShift] << 1);
    }
}

void yMerge(uint32_t* s,
            const uint32_t uint32sInARow,
            const uint32_t rows)
{
    const uint32_t* q = s + uint32sInARow;
    uint32_t i = 0;
    for (; i + 4 < uint32sInARow*rows; i += 4) {
        __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
        __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(q + i));
        __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 2));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(s + i), c);
    }
    for (; i < uint32sInARow*rows; i++) {
        s[i] = s[i] | (q[i] << 2);
    }
}
void zMerge(uint32_t* s,
            const uint32_t uint32sInASlice,
            const uint32_t slices)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(zero, zero);
    const uint32_t* q = s + uint32sInASlice;
    uint32_t i = 0;
    for (; i + 4 < uint32sInASlice*slices; i += 4) {
        __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
        __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(q + i));
        __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 4));
        // Clear bytes equal to 0xff (all corners set, case 255).
        __m128i m = _mm_cmpeq_epi8(c, ones);
        c = _mm_andnot_si128(m, c);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(s + i), c);
    }
    for (; i < uint32sInASlice*slices; i++) {
        __m128i a = _mm_cvtsi32_si128(static_cast<int>(s[i]));
        __m128i b = _mm_cvtsi32_si128(static_cast<int>(q[i]));
        __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 4));
        __m128i m = _mm_cmpeq_epi8(c, ones);
        c = _mm_andnot_si128(m, c);
        s[i] = static_cast<uint32_t>(_mm_cvtsi128_si32(c));
    }
}
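// countActiveCells4/8/16 count, per threshold lane, how many cells in the
// clamped tile have a non-zero case code. The counts are used by the callers
// to reserve ranges in the global output arrays before the cells are written.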
void countActiveCells16(uint32_t* Nc,
                        const uint32_t* s,
                        const uvec3& tileSizeClamped,
                        const uvec3& tileSize)
{
    const uvec3 scratchSize = tileSize + uvec3(1);

    __m128i Nc_[4] = { _mm_setzero_si128(), _mm_setzero_si128(),
                       _mm_setzero_si128(), _mm_setzero_si128() };
    __m128i ones4 = _mm_set1_epi32(1);
    __m128i zero = _mm_setzero_si128();
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto * src = s + 4 * (k*scratchSize.y + j)*scratchSize.x;
            for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + i);
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);
                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                Nc_[0] = _mm_add_epi32(Nc_[0], _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                Nc_[1] = _mm_add_epi32(Nc_[1], _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));
                Nc_[2] = _mm_add_epi32(Nc_[2], _mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4));
                Nc_[3] = _mm_add_epi32(Nc_[3], _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));
            }
        }
    }
    for (uint32_t l = 0; l < 4; l++) {
        uint32_t counts[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(counts), Nc_[l]);
        for (uint32_t m = 0; m < 4; m++) {
            Nc[4 * l + m] = counts[m];
        }
    }
}
void countActiveCells8(uint32_t* Nc,
                       const uint32_t* s,
                       const uvec3& tileSizeClamped,
                       const uvec3& tileSize)
{
    const uvec3 scratchSize = tileSize + uvec3(1);

    __m128i Nc_[2] = { _mm_setzero_si128(), _mm_setzero_si128() };
    __m128i ones4 = _mm_set1_epi32(1);
    __m128i zero = _mm_setzero_si128();
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto * src = s + 2 * (k*scratchSize.y + j)*scratchSize.x;

            uint32_t i = 0;
            for (; i + 1 < tileSizeClamped.x; i += 2) {
                __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 2 * i));
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);

                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                __m128i sum03 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4),
                                              _mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4));
                Nc_[0] = _mm_add_epi32(Nc_[0], sum03);

                __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                __m128i sum47 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4),
                                              _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));
                Nc_[1] = _mm_add_epi32(Nc_[1], sum47);
            }
            for (; i < tileSizeClamped.x; i++) {
                __m128i code16_0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src + 2 * i));
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                Nc_[0] = _mm_add_epi32(Nc_[0], _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                Nc_[1] = _mm_add_epi32(Nc_[1], _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));
            }
        }
    }
    for (uint32_t l = 0; l < 2; l++) {
        uint32_t counts[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(counts), Nc_[l]);
        for (uint32_t m = 0; m < 4; m++) {
            Nc[4 * l + m] = counts[m];
        }
    }
}
void countActiveCells4(uint32_t* Nc,
                       const uint32_t* s,
                       const uvec3& tileSizeClamped,
                       const uvec3& tileSize)
{
    const uvec3 scratchSize = tileSize + uvec3(1);
    __m128i Nc_ = _mm_setzero_si128();

    __m128i ones4 = _mm_set1_epi32(1);
    __m128i zero = _mm_setzero_si128();

    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto * src = s + (k*scratchSize.y + j)*scratchSize.x;

            uint32_t i = 0;
            for (; i + 4 < tileSizeClamped.x; i += 4) {
                __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                __m128i sum0 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4),
                                             _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));

                __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);
                __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                __m128i sum1 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4),
                                             _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));

                Nc_ = _mm_add_epi32(Nc_, _mm_add_epi32(sum0, sum1));
            }
            for (; i < tileSizeClamped.x; i++) {
                __m128i code16_0 = _mm_cvtsi32_si128(static_cast<int>(src[i]));
                __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                Nc_ = _mm_add_epi32(Nc_, _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
            }
        }
    }
    uint32_t counts[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(counts), Nc_);
    for (uint32_t l = 0; l < 4; l++) {
        Nc[l] = counts[l];
    }
}
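// calculateOffsets4/8/16 walk the merged case codes a second time and, for
// every active cell, store its compacted index in cellMap, its case code in
// activeCellCases and its uncompacted cell index in activeCellIndices, using
// the per-lane base offsets Oc reserved by the caller.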
void calculateOffsets16(int32_t* cellMap,
                        uint8_t* activeCellCases,
                        int32_t* activeCellIndices,
                        const __m128i* s,
                        const uvec3& tileSizeClamped,
                        const uvec3& tileSize,
                        const uvec3& rA,
                        const uvec3& M,
                        const uint32_t* Oc,
                        const size_t layerStride,
                        const uint32_t tOff,
                        const uint32_t lanes)
{
    uint32_t Ni[16];
    for (uint32_t l = 0; l < lanes; l++) {
        Ni[l] = 0;
    }

    const uvec3 scratchSize = tileSize + uvec3(1);
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto * src = s + (k*scratchSize.y + j)*scratchSize.x;
            const auto lineOffset = ((k + rA.z)*M.y + (j + rA.y))*M.x;

            for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                __m128i codes = _mm_loadu_si128(src + i);
                // Spill the sixteen lane codes to memory for scalar access.
                uint8_t code8[16];
                _mm_storeu_si128(reinterpret_cast<__m128i*>(code8), codes);

                for (uint32_t l = 0; l < lanes; l++) {
                    const auto code = code8[l];
                    if (code == 0)
                        continue;

                    const auto t = tOff + l;
                    const auto uncompactedCellIndex = lineOffset + i + rA.x;
                    const auto c = Oc[l] + (Ni[l]++);
                    cellMap[layerStride*t + uncompactedCellIndex] = c;
                    activeCellCases[layerStride*t + c] = code;
                    activeCellIndices[layerStride*t + c] = uncompactedCellIndex;
                }
            }
        }
    }
}
void calculateOffsets8(int32_t* cellMap,
                       uint8_t* activeCellCases,
                       int32_t* activeCellIndices,
                       const uint64_t* s,
                       const uvec3& tileSizeClamped,
                       const uvec3& tileSize,
                       const uvec3& rA,
                       const uvec3& M,
                       const uint32_t* Oc,
                       const size_t layerStride,
                       const uint32_t tOff,
                       const uint32_t lanes)
{
    uint32_t Ni[8];
    for (uint32_t l = 0; l < lanes; l++) {
        Ni[l] = 0;
    }

    const uvec3 scratchSize = tileSize + uvec3(1);
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto * src = s + (k*scratchSize.y + j)*scratchSize.x;
            const auto lineOffset = ((k + rA.z)*M.y + (j + rA.y))*M.x;

            for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                // View the eight lane codes of this cell as individual bytes.
                union { uint64_t u64; uint8_t u8[8]; } codes;
                codes.u64 = src[i];

                for (uint32_t l = 0; l < lanes; l++) {
                    const auto code = codes.u8[l];
                    if (code == 0)
                        continue;

                    const auto t = tOff + l;
                    const auto uncompactedCellIndex = lineOffset + i + rA.x;
                    const auto c = Oc[l] + (Ni[l]++);
                    cellMap[layerStride*t + uncompactedCellIndex] = c;
                    activeCellCases[layerStride*t + c] = code;
                    activeCellIndices[layerStride*t + c] = uncompactedCellIndex;
                }
            }
        }
    }
}
void calculateOffsets4(int32_t* cellMap,
                       uint8_t* activeCellCases,
                       int32_t* activeCellIndices,
                       const uint32_t* s,
                       const uvec3& tileSizeClamped,
                       const uvec3& tileSize,
                       const uvec3& rA,
                       const uvec3& M,
                       const uint32_t* Oc,
                       const size_t layerStride,
                       const uint32_t tOff,
                       const uint32_t lanes)
{
    uint32_t Ni[4];
    for (uint32_t l = 0; l < lanes; l++) {
        Ni[l] = 0;
    }

    const uvec3 scratchSize = tileSize + uvec3(1);
    for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
        for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
            const auto * src = s + (k*scratchSize.y + j)*scratchSize.x;
            const auto lineOffset = ((k + rA.z)*M.y + (j + rA.y))*M.x;

            for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                // View the four lane codes of this cell as individual bytes.
                union { uint32_t u32; uint8_t u8[4]; } codes;
                codes.u32 = src[i];

                for (uint32_t l = 0; l < lanes; l++) {
                    const auto code = codes.u8[l];
                    if (code == 0)
                        continue;

                    const auto t = tOff + l;
                    const auto uncompactedCellIndex = lineOffset + i + rA.x;
                    const auto c = Oc[l] + (Ni[l]++);
                    cellMap[layerStride*t + uncompactedCellIndex] = c;
                    activeCellCases[layerStride*t + c] = code;
                    activeCellIndices[layerStride*t + c] = uncompactedCellIndex;
                }
            }
        }
    }
}
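// Runs the three merge passes over the whole scratch volume. uint32XShift is
// the offset (in uint32 words) at which setInitialBit* staged the corner bits;
// xMerge compacts the data back to the start of the buffer while merging.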
void merge(uint32_t* s,
           const uint32_t uint32XShift,
           const uint32_t uint32sPerElement,
           const uint32_t tileX,
           const uint32_t tileY,
           const uint32_t tileZ)
{
    xMerge(s, uint32sPerElement, (tileX + 1)*(tileY + 1)*(tileZ + 1) - 1, uint32XShift);
    yMerge(s, uint32sPerElement * (tileX + 1), (tileY + 1)*(tileZ + 1) - 1);
    zMerge(s, uint32sPerElement * (tileX + 1)*(tileY + 1), tileZ);
}
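// Analyzes one tile of a float field: classifies corners, builds cell case
// codes, reserves space in the global output arrays and compacts the active
// cells, processing up to eight iso-thresholds per pass.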
void Cogs::Core::IsoSurfaces::analyzeTile_f32_SSE(AnalyzeGlobalState* g, const glm::ivec3 id)
{
    CpuInstrumentationScope(SCOPE_ISOSURFACES, "analyzeTile_f32_SSE");
    auto timer = Timer::startNew();

    const float* field = (const float*)g->field;
    const float* thresholds = (const float*)g->thresholds;

    const uvec3 fieldDim = uvec3(g->fieldDim);
    const uvec3 tileSize = uvec3(g->tileSize);
    const uvec3 scratchSize = tileSize + uvec3(1);
    const uvec3 M = uvec3(g->M);
    const auto exteriorIsLess = g->exteriorIsLess;
    uvec3 rA = tileSize * uvec3(id);
    uvec3 rB = glm::min(M, rA + tileSize);
    const auto tileSizeClamped = glm::min(tileSize, rB - rA);
    const size_t layerStride = g->M.x * g->M.y * g->M.z;
    const auto Nt = static_cast<uint32_t>(g->Nt);

    auto * scratch = g->scratchAcquire(4 * sizeof(int) * (scratchSize.x * scratchSize.y * scratchSize.z + 1 + 4));
    auto * s = reinterpret_cast<uint32_t*>(scratch->data());

    // Per-batch threshold values, active-cell counts and reserved base offsets.
    float T[8];
    uint32_t Nc[8];
    uint32_t Oc[8];
    uint32_t tOff = 0;

    // Process thresholds eight at a time while more than four remain.
    for (; tOff + 4 < Nt; tOff += 8) {
        auto lanes = min(8u, Nt - tOff);
        for (uint32_t i = 0; i < 8; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit8(reinterpret_cast<uint64_t*>(s) + 8, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 2, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells8(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets8(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          reinterpret_cast<uint64_t*>(s),
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    // Handle the remaining (at most four) thresholds.
    for (; tOff < Nt; tOff += 4) {
        auto lanes = min(4u, Nt - tOff);
        for (uint32_t i = 0; i < 4; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit4(s + 16, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 1, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells4(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets4(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          s,
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    g->scratchRelease(scratch);

    if (g->elapsed_us != nullptr) {
        g->elapsed_us->fetch_add(timer.elapsedMicroseconds());
    }
}
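// Same as analyzeTile_f32_SSE, but for uint16 fields; samples are widened to
// float before the threshold comparisons.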
void Cogs::Core::IsoSurfaces::analyzeTile_u16_SSE(AnalyzeGlobalState* g, const glm::ivec3 id)
{
    CpuInstrumentationScope(SCOPE_ISOSURFACES, "analyzeTile_u16_SSE");
    auto timer = Timer::startNew();

    const uint16_t* field = (const uint16_t*)g->field;
    const uint16_t* thresholds = (const uint16_t*)g->thresholds;

    const uvec3 fieldDim = uvec3(g->fieldDim);
    const uvec3 tileSize = uvec3(g->tileSize);
    const uvec3 scratchSize = tileSize + uvec3(1);
    const uvec3 M = uvec3(g->M);
    const auto exteriorIsLess = g->exteriorIsLess;
    uvec3 rA = tileSize * uvec3(id);
    uvec3 rB = glm::min(M, rA + tileSize);
    const auto tileSizeClamped = glm::min(tileSize, rB - rA);
    const size_t layerStride = g->M.x * g->M.y * g->M.z;
    const auto Nt = static_cast<uint32_t>(g->Nt);

    auto * scratch = g->scratchAcquire(4 * sizeof(int) * (scratchSize.x * scratchSize.y * scratchSize.z + 1 + 4));
    auto * s = reinterpret_cast<uint32_t*>(scratch->data());

    // Per-batch threshold values (widened to float), counts and base offsets.
    float T[8];
    uint32_t Nc[8];
    uint32_t Oc[8];
    uint32_t tOff = 0;

    // Process thresholds eight at a time while more than four remain.
    for (; tOff + 4 < Nt; tOff += 8) {
        auto lanes = min(8u, Nt - tOff);
        for (uint32_t i = 0; i < 8; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit8(reinterpret_cast<uint64_t*>(s) + 8, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 2, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells8(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets8(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          reinterpret_cast<uint64_t*>(s),
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    // Handle the remaining (at most four) thresholds.
    for (; tOff < Nt; tOff += 4) {
        auto lanes = min(4u, Nt - tOff);
        for (uint32_t i = 0; i < 4; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit4(s + 16, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 1, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells4(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets4(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          s,
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    g->scratchRelease(scratch);

    if (g->elapsed_us != nullptr) {
        g->elapsed_us->fetch_add(timer.elapsedMicroseconds());
    }
}

#endif