Cogs.Core
Analyze_SSE.cpp
#if !defined(EMSCRIPTEN) && !defined(__APPLE__)
#include "Context.h"
#include "Services/Features.h"
#include "IsoSurfaces_internal.h"
#include "Platform/Instrumentation.h"

#include "Foundation/Platform/Timer.h"

#include <algorithm>
#include <cassert>
#include <cstdint>
#ifdef __linux__
    #include <x86intrin.h>
#else
    #include <intrin.h>
#endif

using std::min;
using std::max;
using glm::uvec3;
using glm::ivec3;

using namespace Cogs::Core;

namespace {
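    // Splits one output scanline of the padded tile into three runs: out-of-domain
    // padding at the start, samples to copy from the field, and padding at the end.
    // in_shift is the (possibly negative) position of the output line relative to the
    // field, and in_skip is how far to advance the source pointer before copying.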
    void determineRunlengths(uint32_t& pad_start,   // Amount of out-of-domain padding at start of line.
                             uint32_t& in_skip,     // Source pointer adjustment.
                             uint32_t& process,     // Number of samples to process.
                             uint32_t& pad_stop,    // Amount of out-of-domain padding at end of line.
                             const int32_t in_shift,
                             const uint32_t out_runlength,
                             const uint32_t in_runlength)
    {
        if (in_shift < 0) {
            pad_start = min(out_runlength, static_cast<uint32_t>(-in_shift));
            in_skip = 0;
            process = min(out_runlength - pad_start,    // cannot be negative
                          in_runlength);
        }
        else {
            pad_start = 0;
            in_skip = static_cast<uint32_t>(in_shift);
            // Guard the subtraction explicitly: in_runlength - in_shift is evaluated in
            // unsigned arithmetic and would wrap when in_runlength < in_shift.
            process = min(out_runlength,
                          in_shift < static_cast<int32_t>(in_runlength) ? in_runlength - static_cast<uint32_t>(in_shift) : 0u);
        }
        assert(pad_start <= out_runlength);
        assert(process <= out_runlength);

        pad_stop = out_runlength - pad_start - process;
    }
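    // Fills 'count' uint32 values at dst with the exterior pattern held in every lane
    // of 'out', writing 16, then 8, then 4 bytes at a time, and returns the advanced
    // destination pointer.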
    inline uint32_t* writePadding(uint32_t* dst, uint32_t count, const __m128i out)
    {
        uint32_t i = 0;
        while (i + 4 <= count)
        {
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), out); i += 4;
        }
        if (i + 2 <= count) {
            _mm_storel_epi64(reinterpret_cast<__m128i*>(dst + i), out); i += 2;
        }
        if (i != count) {
            dst[i] = out.m128i_u32[0]; i++;
        }
        return dst + i;
    }
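    // fetch2/fetch4 load two or four consecutive samples and widen them to packed
    // floats; the uint16_t specializations zero-extend before converting to float.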
    template<typename Type>
    inline __m128 fetch2(const Type * mem);

    template<>
    inline __m128 fetch2<float>(const float * mem)
    {
        // Fetch two floats
        return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(mem)));
    }

    template<>
    inline __m128 fetch2<uint16_t>(const uint16_t * mem)
    {
        __m128i t = _mm_setzero_si128();
        t.m128i_u32[0] = *reinterpret_cast<const uint32_t*>(mem);
        t = _mm_unpacklo_epi16(t, _mm_setzero_si128());
        return _mm_cvtepi32_ps(t);
    }

    template<typename Type>
    inline __m128 fetch4(const Type * mem);

    template<>
    inline __m128 fetch4<float>(const float * mem)
    {
        return _mm_loadu_ps(mem);
    }

    template<>
    inline __m128 fetch4<uint16_t>(const uint16_t * mem)
    {
        __m128i t = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mem));
        t = _mm_unpacklo_epi16(t, _mm_setzero_si128());
        return _mm_cvtepi32_ps(t);
    }
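    // Classifies every sample of the padded tile against eight thresholds T[0..7] and
    // writes one byte per threshold: byte l of each 8-byte element is 1 when the sample
    // is below T[l], 0 otherwise. Out-of-domain samples get the exterior classification
    // (1 if exteriorIsLess, otherwise 0). Scalar equivalent for one in-domain sample,
    // as a sketch:
    //     for (int l = 0; l < 8; l++) bytes[l] = (field[idx] < T[l]) ? 1 : 0;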
    template<typename Type>
    void setInitialBit8(uint64_t* s,
                        const uvec3& fieldDim,
                        const uvec3& tileSize,
                        const uvec3& rA,
                        const ivec3& gridA,   // Might be negative
                        const Type* T,
                        const Type* field,
                        const bool exteriorIsLess)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);

        uint32_t x_pad_start, x_in_skip, x_process, x_pad_stop;
        determineRunlengths(x_pad_start, x_in_skip, x_process, x_pad_stop,
                            rA.x + gridA.x, scratchSize.x, fieldDim.x);

        uint32_t y_pad_start, y_in_skip, y_process, y_pad_stop;
        determineRunlengths(y_pad_start, y_in_skip, y_process, y_pad_stop,
                            rA.y + gridA.y, scratchSize.y, fieldDim.y);

        uint32_t z_pad_start, z_in_skip, z_process, z_pad_stop;
        determineRunlengths(z_pad_start, z_in_skip, z_process, z_pad_stop,
                            rA.z + gridA.z, scratchSize.z, fieldDim.z);

        uint32_t * dst = reinterpret_cast<uint32_t*>(s);

        const __m128i ones = _mm_set1_epi8(1);
        const __m128i ext = _mm_set1_epi32(exteriorIsLess ? ~0 : 0);
        const __m128i out = _mm_and_si128(ext, ones);

        const __m128 t0 = _mm_set_ps(T[3], T[2], T[1], T[0]);
        const __m128 t1 = _mm_set_ps(T[7], T[6], T[5], T[4]);

        dst = writePadding(dst, 2 * scratchSize.y * scratchSize.x * z_pad_start, out);
        for (uint32_t k = 0; k < z_process; k++) {
            dst = writePadding(dst, 2 * scratchSize.x*y_pad_start, out);
            for (uint32_t j = 0; j < y_process; j++) {
                dst = writePadding(dst, 2 * x_pad_start, out);

                const auto * src = field + static_cast<int32_t>(((k + z_in_skip)*fieldDim.y + j + y_in_skip)*fieldDim.x) + x_in_skip;
                for (uint32_t i = 0; i < (x_process >> 1); i++) {
                    __m128 v = fetch2(src); src += 2;

                    __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
                    __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v0, t0));
                    __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(v0, t1));
                    __m128i mm0 = _mm_packs_epi32(m0, m1);

                    __m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
                    __m128i m2 = _mm_castps_si128(_mm_cmplt_ps(v1, t0));
                    __m128i m3 = _mm_castps_si128(_mm_cmplt_ps(v1, t1));
                    __m128i mm1 = _mm_packs_epi32(m2, m3);

                    __m128i m = _mm_packs_epi16(mm0, mm1);
                    __m128i b = _mm_and_si128(m, ones);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), b);
                    dst += 4;
                }
                if (x_process & 1) {
                    __m128 v = _mm_set1_ps(*src++);
                    __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v, t0));
                    __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(v, t1));
                    __m128i mm0 = _mm_packs_epi32(m0, m1);

                    __m128i m = _mm_packs_epi16(mm0, _mm_setzero_si128());
                    __m128i b = _mm_and_si128(m, ones);

                    _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), b);
                    dst += 2;
                }
                dst = writePadding(dst, 2 * x_pad_stop, out);
            }
            dst = writePadding(dst, 2 * scratchSize.x*y_pad_stop, out);
        }
        dst = writePadding(dst, 2 * scratchSize.y * scratchSize.x * z_pad_stop, out);
    }
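    // Same as setInitialBit8 but for up to four thresholds: each padded sample produces
    // one uint32 whose byte l is 1 when the sample is below T[l].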
    template<typename Type>
    void setInitialBit4(uint32_t* s,
                        const uvec3& fieldDim,
                        const uvec3& tileSize,
                        const uvec3& rA,
                        const ivec3& gridA,   // Might be negative
                        const Type* T,
                        const Type* field,
                        const bool exteriorIsLess)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);

        uint32_t x_pad_start, x_in_skip, x_process, x_pad_stop;
        determineRunlengths(x_pad_start, x_in_skip, x_process, x_pad_stop,
                            rA.x + gridA.x, scratchSize.x, fieldDim.x);

        uint32_t y_pad_start, y_in_skip, y_process, y_pad_stop;
        determineRunlengths(y_pad_start, y_in_skip, y_process, y_pad_stop,
                            rA.y + gridA.y, scratchSize.y, fieldDim.y);

        uint32_t z_pad_start, z_in_skip, z_process, z_pad_stop;
        determineRunlengths(z_pad_start, z_in_skip, z_process, z_pad_stop,
                            rA.z + gridA.z, scratchSize.z, fieldDim.z);

        uint32_t * dst = s;

        const __m128i ones = _mm_set1_epi8(1);
        const __m128i ext = _mm_set1_epi32(exteriorIsLess ? ~0 : 0);
        const __m128i out = _mm_and_si128(ext, ones);

        const __m128 t0 = _mm_set_ps(T[3], T[2], T[1], T[0]);

        dst = writePadding(dst, scratchSize.y * scratchSize.x * z_pad_start, out);
        for (uint32_t k = 0; k < z_process; k++) {
            dst = writePadding(dst, scratchSize.x*y_pad_start, out);
            for (uint32_t j = 0; j < y_process; j++) {
                dst = writePadding(dst, x_pad_start, out);

                const auto * src = field + static_cast<int32_t>(((k + z_in_skip)*fieldDim.y + j + y_in_skip)*fieldDim.x) + x_in_skip;
                for (uint32_t i = 0; i < (x_process >> 2); i++) {
                    __m128 v = fetch4(src); src += 4;

                    __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), t0));
                    __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), t0));
                    __m128i mm0 = _mm_packs_epi32(m0, m1);

                    __m128i m2 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), t0));
                    __m128i m3 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), t0));
                    __m128i mm1 = _mm_packs_epi32(m2, m3);

                    __m128i m = _mm_packs_epi16(mm0, mm1);
                    __m128i b = _mm_and_si128(m, ones);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), b);
                    dst += 4;
                }
                for (uint32_t i = 0; i < (x_process & 3); i++) {
                    __m128 v = _mm_set1_ps(*src++);
                    __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v, t0));
                    __m128i mm0 = _mm_packs_epi32(m0, _mm_setzero_si128());
                    __m128i m = _mm_packs_epi16(mm0, _mm_setzero_si128());
                    __m128i b = _mm_and_si128(m, ones);
                    *dst++ = b.m128i_u32[0];
                }
                dst = writePadding(dst, x_pad_stop, out);
            }
            dst = writePadding(dst, scratchSize.x*y_pad_stop, out);
        }
        dst = writePadding(dst, scratchSize.y * scratchSize.x * z_pad_stop, out);
    }
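    // The three merge passes below fold the per-corner classification bytes into an
    // 8-bit marching-cubes style case code per cell, one byte lane per threshold.
    // xMerge ORs each element with its +x neighbour shifted left by one bit, and at
    // the same time compacts away the inOutShift offset that the initial pass wrote
    // its output at.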
    void xMerge(uint32_t* s, const uint32_t uint32sInElement, const uint32_t elements, const uint32_t inOutShift)
    {
        const uint32_t* q = s + uint32sInElement;
        uint32_t i = 0;
        for (; i + 4 < uint32sInElement*elements; i += 4) {
            __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + inOutShift));
            __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(q + i + inOutShift));
            __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 1));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(s + i), c);
        }
        for (; i < uint32sInElement*elements; i++) {
            s[i] = s[i + inOutShift] | (q[i + inOutShift] << 1);
        }
    }
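    // ORs each row with the following row shifted left by two bits, so every byte now
    // holds the four corner bits of an xy face.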
    void yMerge(uint32_t* s, const uint32_t uint32sInARow, const uint32_t rows)
    {
        const uint32_t* q = s + uint32sInARow;
        uint32_t i = 0;
        for (; i + 4 < uint32sInARow*rows; i += 4) {
            __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
            __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(q + i));
            __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 2));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(s + i), c);
        }
        for (; i < uint32sInARow*rows; i++) {
            s[i] = s[i] | (q[i] << 2);
        }
    }
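    // ORs each slice with the following slice shifted left by four bits, completing the
    // 8-corner case code. Codes of 0xFF (all corners below the threshold) are cleared
    // to zero so fully interior cells are treated as inactive, like fully exterior ones.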
    void zMerge(uint32_t* s, const uint32_t uint32sInASlice, const uint32_t slices)
    {
        __m128i zero = _mm_setzero_si128();
        __m128i ones = _mm_cmpeq_epi32(zero, zero);
        const uint32_t* q = s + uint32sInASlice;
        uint32_t i = 0;
        for (; i + 4 < uint32sInASlice*slices; i += 4) {
            __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
            __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(q + i));
            __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 4));
            __m128i m = _mm_cmpeq_epi8(c, ones);
            c = _mm_andnot_si128(m, c);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(s + i), c);
        }
        for (; i < uint32sInASlice*slices; i++) {
            __m128i a = _mm_setzero_si128(); a.m128i_u32[0] = s[i];
            __m128i b = _mm_setzero_si128(); b.m128i_u32[0] = q[i];
            __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 4));
            __m128i m = _mm_cmpeq_epi8(c, ones);
            c = _mm_andnot_si128(m, c);
            s[i] = c.m128i_u32[0];
        }
    }
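    // countActiveCells16/8/4 count, per threshold lane, how many cells of the clamped
    // tile ended up with a non-zero case code.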
    void countActiveCells16(uint32_t* Nc,
                            const uint32_t* s,
                            const uvec3& tileSizeClamped,
                            const uvec3& tileSize)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);
        __m128i Nc_[4] = {
            _mm_setzero_si128(),
            _mm_setzero_si128(),
            _mm_setzero_si128(),
            _mm_setzero_si128()
        };
        __m128i ones4 = _mm_set1_epi32(1);
        __m128i zero = _mm_setzero_si128();
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + 4 * (k*scratchSize.y + j)*scratchSize.x;
                for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                    __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + i);
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);
                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                    __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                    __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                    Nc_[0] = _mm_add_epi32(Nc_[0], _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                    Nc_[1] = _mm_add_epi32(Nc_[1], _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));
                    Nc_[2] = _mm_add_epi32(Nc_[2], _mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4));
                    Nc_[3] = _mm_add_epi32(Nc_[3], _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));
                }
            }
        }
        for (uint32_t l = 0; l < 16; l++) {
            Nc[l] = Nc_[l / 4].m128i_i32[l % 4];
        }
    }

    void countActiveCells8(uint32_t* Nc,
                           const uint32_t* s,
                           const uvec3& tileSizeClamped,
                           const uvec3& tileSize)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);
        __m128i Nc_[2] = {
            _mm_setzero_si128(),
            _mm_setzero_si128()
        };
        __m128i ones4 = _mm_set1_epi32(1);
        __m128i zero = _mm_setzero_si128();
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + 2 * (k*scratchSize.y + j)*scratchSize.x;

                uint32_t i = 0;
                for (; i + 1 < tileSizeClamped.x; i += 2) {
                    __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 2 * i));
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);

                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                    __m128i sum03 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4),
                                                  _mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4));
                    Nc_[0] = _mm_add_epi32(Nc_[0], sum03);

                    __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                    __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                    __m128i sum47 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4),
                                                  _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));
                    Nc_[1] = _mm_add_epi32(Nc_[1], sum47);
                }
                for (; i < tileSizeClamped.x; i++) {
                    __m128i code16_0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src + 2 * i));
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                    Nc_[0] = _mm_add_epi32(Nc_[0], _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                    Nc_[1] = _mm_add_epi32(Nc_[1], _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));
                }
            }
        }
        for (uint32_t l = 0; l < 8; l++) {
            Nc[l] = Nc_[l / 4].m128i_i32[l % 4];
        }
    }

    void countActiveCells4(uint32_t* Nc,
                           const uint32_t* s,
                           const uvec3& tileSizeClamped,
                           const uvec3& tileSize)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);
        __m128i Nc_ = _mm_setzero_si128();

        __m128i ones4 = _mm_set1_epi32(1);
        __m128i zero = _mm_setzero_si128();

        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + (k*scratchSize.y + j)*scratchSize.x;

                uint32_t i = 0;
                for (; i + 4 < tileSizeClamped.x; i += 4) {
                    __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                    __m128i sum0 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4),
                                                 _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));

                    __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);
                    __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                    __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                    __m128i sum1 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4),
                                                 _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));

                    Nc_ = _mm_add_epi32(Nc_, _mm_add_epi32(sum0, sum1));
                }
                for (; i < tileSizeClamped.x; i++) {
                    __m128i code16_0 = zero;
                    code16_0.m128i_u32[0] = src[i];
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    Nc_ = _mm_add_epi32(Nc_, _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                }
            }
        }
        for (uint32_t l = 0; l < 4; l++) {
            Nc[l] = Nc_.m128i_i32[l];
        }
    }
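    // calculateOffsets16/8/4 walk the case codes a second time and, for every active
    // cell in lane l, assign a compacted index starting at Oc[l], recording the
    // uncompacted-to-compacted mapping (cellMap), the case code (activeCellCases) and
    // the uncompacted cell index (activeCellIndices) for surface tOff + l.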
    void calculateOffsets16(int32_t* cellMap,
                            uint8_t* activeCellCases,
                            int32_t* activeCellIndices,
                            const __m128i* s,
                            const uvec3& tileSizeClamped,
                            const uvec3& tileSize,
                            const uvec3& rA,
                            const uvec3& M,
                            const uint32_t* Oc,
                            const size_t layerStride,
                            const uint32_t tOff,
                            const uint32_t lanes)
    {
        uint32_t Ni[16];
        for (uint32_t l = 0; l < lanes; l++) {
            Ni[l] = 0;
        }

        const uvec3 scratchSize = tileSize + uvec3(1);
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + (k*scratchSize.y + j)*scratchSize.x;
                const auto lineOffset = ((k + rA.z)*M.y + (j + rA.y))*M.x;

                for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                    __m128i codes = _mm_loadu_si128(src + i);

                    for (uint32_t l = 0; l < lanes; l++) {
                        const auto code = codes.m128i_u8[l];
                        if (code == 0) continue;

                        const auto t = tOff + l;
                        const auto uncompactedCellIndex = lineOffset + i + rA.x;
                        const auto c = Oc[l] + (Ni[l]++);
                        cellMap[layerStride*t + uncompactedCellIndex] = c;
                        activeCellCases[layerStride*t + c] = code;
                        activeCellIndices[layerStride*t + c] = uncompactedCellIndex;
                    }
                }
            }
        }
    }

    void calculateOffsets8(int32_t* cellMap,
                           uint8_t* activeCellCases,
                           int32_t* activeCellIndices,
                           const uint64_t* s,
                           const uvec3& tileSizeClamped,
                           const uvec3& tileSize,
                           const uvec3& rA,
                           const uvec3& M,
                           const uint32_t* Oc,
                           const size_t layerStride,
                           const uint32_t tOff,
                           const uint32_t lanes)
    {
        uint32_t Ni[8];
        for (uint32_t l = 0; l < lanes; l++) {
            Ni[l] = 0;
        }

        const uvec3 scratchSize = tileSize + uvec3(1);
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + (k*scratchSize.y + j)*scratchSize.x;
                const auto lineOffset = ((k + rA.z)*M.y + (j + rA.y))*M.x;

                for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                    union
                    {
                        uint64_t u64;
                        uint8_t u8[8];
                    } codes;
                    codes.u64 = src[i];

                    for (uint32_t l = 0; l < lanes; l++) {
                        const auto code = codes.u8[l];
                        if (code == 0) continue;

                        const auto t = tOff + l;
                        const auto uncompactedCellIndex = lineOffset + i + rA.x;
                        const auto c = Oc[l] + (Ni[l]++);
                        cellMap[layerStride*t + uncompactedCellIndex] = c;
                        activeCellCases[layerStride*t + c] = code;
                        activeCellIndices[layerStride*t + c] = uncompactedCellIndex;
                    }
                }
            }
        }
    }

    void calculateOffsets4(int32_t* cellMap,
                           uint8_t* activeCellCases,
                           int32_t* activeCellIndices,
                           const uint32_t* s,
                           const uvec3& tileSizeClamped,
                           const uvec3& tileSize,
                           const uvec3& rA,
                           const uvec3& M,
                           const uint32_t* Oc,
                           const size_t layerStride,
                           const uint32_t tOff,
                           const uint32_t lanes)
    {
        assert(lanes <= 4);
        uint32_t Ni[4];
        for (uint32_t l = 0; l < lanes; l++) {
            Ni[l] = 0;
        }

        const uvec3 scratchSize = tileSize + uvec3(1);
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + (k*scratchSize.y + j)*scratchSize.x;
                const auto lineOffset = ((k + rA.z)*M.y + (j + rA.y))*M.x;

                for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                    union
                    {
                        uint32_t u32;
                        uint8_t u8[4];
                    } codes;
                    codes.u32 = src[i];

                    for (uint32_t l = 0; l < lanes; l++) {
                        const auto code = codes.u8[l];
                        if (code == 0) continue;

                        const auto t = tOff + l;
                        const auto uncompactedCellIndex = lineOffset + i + rA.x;
                        const auto c = Oc[l] + (Ni[l]++);
                        cellMap[layerStride*t + uncompactedCellIndex] = c;
                        activeCellCases[layerStride*t + c] = code;
                        activeCellIndices[layerStride*t + c] = uncompactedCellIndex;
                    }
                }
            }
        }
    }
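    // Runs the x/y/z merge cascade over the whole (tileX+1) x (tileY+1) x (tileZ+1)
    // scratch volume.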
    void merge(uint32_t* s,
               const uint32_t uint32XShift,
               const uint32_t uint32sPerElement,
               const uint32_t tileX,
               const uint32_t tileY,
               const uint32_t tileZ)
    {
        xMerge(s, uint32sPerElement, (tileX + 1)*(tileY + 1)*(tileZ + 1) - 1, uint32XShift);
        yMerge(s, uint32sPerElement * (tileX + 1), (tileY + 1)*(tileZ + 1) - 1);
        zMerge(s, uint32sPerElement * (tileX + 1)*(tileY + 1), tileZ);
    }

}
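// Analysis pass for float32 fields: classifies every cell of tile 'id' against the
// iso-surface thresholds in g, processing thresholds eight at a time while five or
// more remain and four at a time for the rest, and publishes per-surface active-cell
// counts, case codes and index mappings through g.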
void Cogs::Core::IsoSurfaces::analyzeTile_f32_SSE(AnalyzeGlobalState* g, const glm::ivec3 id)
{
    CpuInstrumentationScope(SCOPE_ISOSURFACES, "analyzeTile_f32_SSE");
    auto timer = Timer::startNew();

    const float* field = (const float*)g->field;
    const float* thresholds = (const float*)g->thresholds;

    const uvec3 fieldDim = uvec3(g->fieldDim);
    const uvec3 tileSize = uvec3(g->tileSize);
    const uvec3 scratchSize = tileSize + uvec3(1);
    const uvec3 M = uvec3(g->M);
    const auto exteriorIsLess = g->exteriorIsLess;
    uvec3 rA = tileSize * uvec3(id);
    uvec3 rB = glm::min(M, rA + tileSize);
    const auto tileSizeClamped = glm::min(tileSize, rB - rA);
    const size_t layerStride = g->M.x * g->M.y * g->M.z;
    const auto Nt = static_cast<uint32_t>(g->Nt);

    float T[16];
    uint32_t Nc[16];
    uint32_t Oc[16];
    auto * scratch = g->scratchAcquire(4 * sizeof(int) * (scratchSize.x * scratchSize.y * scratchSize.z + 1 + 4));
    auto * s = reinterpret_cast<uint32_t*>(scratch->data());

    uint32_t tOff = 0;

    // While there are 5 or more surfaces, do 8 at a time.
    for (; tOff + 4 < Nt; tOff += 8) {
        auto lanes = min(8u, Nt - tOff);
        for (uint32_t i = 0; i < 8; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit8(reinterpret_cast<uint64_t*>(s) + 8, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 2, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells8(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[tOff + l].fetch_add(Nc[l]);
        }
        calculateOffsets8(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          reinterpret_cast<uint64_t*>(s),
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    // And do the rest 4 at a time.
    for (; tOff < Nt; tOff += 4) {
        auto lanes = min(4u, Nt - tOff);
        for (uint32_t i = 0; i < 4; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit4(s + 16, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(reinterpret_cast<uint32_t*>(s), 16, 1, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells4(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[tOff + l].fetch_add(Nc[l]);
        }
        calculateOffsets4(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          s,
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    g->scratchRelease(scratch);

    if (g->elapsed_us != nullptr) {
        g->elapsed_us->fetch_add(timer.elapsedMicroseconds());
    }
}
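// Same analysis pass for uint16 fields; samples and thresholds are widened to float
// before the comparisons.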
void Cogs::Core::IsoSurfaces::analyzeTile_u16_SSE(AnalyzeGlobalState* g, const glm::ivec3 id)
{
    CpuInstrumentationScope(SCOPE_ISOSURFACES, "analyzeTile_u16_SSE");
    auto timer = Timer::startNew();

    const uint16_t* field = (const uint16_t*)g->field;
    const uint16_t* thresholds = (const uint16_t*)g->thresholds;

    const uvec3 fieldDim = uvec3(g->fieldDim);
    const uvec3 tileSize = uvec3(g->tileSize);
    const uvec3 scratchSize = tileSize + uvec3(1);
    const uvec3 M = uvec3(g->M);
    const auto exteriorIsLess = g->exteriorIsLess;
    uvec3 rA = tileSize * uvec3(id);
    uvec3 rB = glm::min(M, rA + tileSize);
    const auto tileSizeClamped = glm::min(tileSize, rB - rA);
    const size_t layerStride = g->M.x * g->M.y * g->M.z;
    const auto Nt = static_cast<uint32_t>(g->Nt);

    uint16_t T[16];
    uint32_t Nc[16];
    uint32_t Oc[16];
    auto * scratch = g->scratchAcquire(4 * sizeof(int) * (scratchSize.x * scratchSize.y * scratchSize.z + 1 + 4));
    auto * s = reinterpret_cast<uint32_t*>(scratch->data());

    uint32_t tOff = 0;

    // While there are 5 or more surfaces, do 8 at a time.
    for (; tOff + 4 < Nt; tOff += 8) {
        auto lanes = min(8u, Nt - tOff);
        for (uint32_t i = 0; i < 8; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit8(reinterpret_cast<uint64_t*>(s) + 8, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 2, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells8(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[tOff + l].fetch_add(Nc[l]);
        }
        calculateOffsets8(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          reinterpret_cast<uint64_t*>(s),
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    // And do the rest 4 at a time.
    for (; tOff < Nt; tOff += 4) {
        auto lanes = min(4u, Nt - tOff);
        for (uint32_t i = 0; i < 4; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit4(s + 16, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(reinterpret_cast<uint32_t*>(s), 16, 1, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells4(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[tOff + l].fetch_add(Nc[l]);
        }
        calculateOffsets4(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          s,
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    g->scratchRelease(scratch);

    if (g->elapsed_us != nullptr) {
        g->elapsed_us->fetch_add(timer.elapsedMicroseconds());
    }
}
#endif