// Cogs.Core: Analyze_avx2.cpp
#if !defined(EMSCRIPTEN) && !defined(__APPLE__)

#include "Context.h"
#include "Services/Features.h"
#include "IsoSurfaces_internal.h"
#include "Platform/Instrumentation.h"

#include "Foundation/Platform/Timer.h"

#ifndef __AVX2__
static_assert(false, "This compile unit must be compiled with AVX2");
#endif

#include <algorithm>
#include <cassert>

#ifdef __linux__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif

using std::min;
using std::max;
using glm::uvec3;
using glm::ivec3;

using namespace Cogs::Core;

namespace {

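    // Clips one output scan line of the tile's scratch volume against the input field.
    // Given the signed shift of the output line relative to the input line, computes how
    // many leading samples fall outside the field (pad_start), how far to advance the
    // source pointer (in_skip), how many samples can be read from the field (process),
    // and how many trailing samples fall outside the field (pad_stop).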
    void determineRunlengths(uint32_t& pad_start,   // Amount of out-of-domain padding at start of line.
                             uint32_t& in_skip,     // Source pointer adjustment.
                             uint32_t& process,     // Number of samples to process.
                             uint32_t& pad_stop,    // Amount of out-of-domain padding at end of line.
                             const int32_t in_shift,
                             const uint32_t out_runlength,
                             const uint32_t in_runlength)
    {
        if (in_shift < 0) {
            pad_start = min(out_runlength, static_cast<uint32_t>(-in_shift));
            in_skip = 0;
            process = min(out_runlength - pad_start,    // Cannot be negative.
                          in_runlength);
        }
        else {
            pad_start = 0;
            in_skip = static_cast<uint32_t>(in_shift);
            // Guard against in_runlength < in_shift explicitly; unsigned wrap-around
            // would otherwise defeat a max-with-zero clamp.
            const uint32_t available = in_runlength > in_skip ? in_runlength - in_skip : 0u;
            process = min(available, out_runlength);
        }
        assert(pad_start <= out_runlength);
        assert(process <= out_runlength);

        pad_stop = out_runlength - pad_start - process;
    }

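    // Fills 'count' uint32 slots at 'dst' with the replicated padding pattern in 'out',
    // using 16-byte stores where possible, and returns the advanced destination pointer.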
    inline uint32_t* writePadding(uint32_t* dst, uint32_t count, const __m128i out)
    {
        uint32_t i = 0;
        while (i + 4 <= count) {
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), out); i += 4;
        }
        if (i + 2 <= count) {
            _mm_storel_epi64(reinterpret_cast<__m128i*>(dst + i), out); i += 2;
        }
        if (i != count) {
            // _mm_cvtsi128_si32 extracts the low 32-bit lane portably; MSVC-only
            // __m128i member access does not compile with GCC/Clang.
            dst[i] = static_cast<uint32_t>(_mm_cvtsi128_si32(out)); i++;
        }
        return dst + i;
    }

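    // fetch2/fetch4 load two or four consecutive samples and widen them to a float vector.
    // The float specializations load the values directly; the uint16_t specializations
    // zero-extend to 32-bit integers before converting to float.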
    template<typename Type>
    inline __m128 fetch2(const Type * mem);

    template<>
    inline __m128 fetch2<float>(const float * mem)
    {
        // Fetch two floats.
        return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(mem)));
    }

    template<>
    inline __m128 fetch2<uint16_t>(const uint16_t * mem)
    {
        // Fetch two uint16 values into the low lane (upper lanes zeroed), widen to
        // uint32 and convert to float.
        __m128i t = _mm_cvtsi32_si128(static_cast<int32_t>(*reinterpret_cast<const uint32_t*>(mem)));
        t = _mm_unpacklo_epi16(t, _mm_setzero_si128());
        return _mm_cvtepi32_ps(t);
    }

    template<typename Type>
    inline __m128 fetch4(const Type * mem);

    template<>
    inline __m128 fetch4<float>(const float * mem)
    {
        return _mm_loadu_ps(mem);
    }

    template<>
    inline __m128 fetch4<uint16_t>(const uint16_t * mem)
    {
        __m128i t = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mem));
        t = _mm_unpacklo_epi16(t, _mm_setzero_si128());
        return _mm_cvtepi32_ps(t);
    }

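    // Builds the per-corner threshold bits for up to eight iso-surfaces at once. For
    // every sample of the (tileSize + 1)^3 scratch volume, each of the eight output
    // bytes gets bit 0 set if the sample is below the corresponding threshold in T;
    // samples outside the field are padded according to exteriorIsLess. The per-corner
    // bits are later combined by xMerge/yMerge/zMerge into per-cell case codes.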
    template<typename Type>
    void setInitialBit8(uint64_t* s,
                        const uvec3& fieldDim,
                        const uvec3& tileSize,
                        const uvec3& rA,
                        const ivec3& gridA,    // Might be negative.
                        const Type* T,
                        const Type* field,
                        const bool exteriorIsLess)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);

        uint32_t x_pad_start, x_in_skip, x_process, x_pad_stop;
        determineRunlengths(x_pad_start, x_in_skip, x_process, x_pad_stop,
                            rA.x + gridA.x, scratchSize.x, fieldDim.x);

        uint32_t y_pad_start, y_in_skip, y_process, y_pad_stop;
        determineRunlengths(y_pad_start, y_in_skip, y_process, y_pad_stop,
                            rA.y + gridA.y, scratchSize.y, fieldDim.y);

        uint32_t z_pad_start, z_in_skip, z_process, z_pad_stop;
        determineRunlengths(z_pad_start, z_in_skip, z_process, z_pad_stop,
                            rA.z + gridA.z, scratchSize.z, fieldDim.z);

        uint32_t * dst = reinterpret_cast<uint32_t*>(s);

        const __m128i ones = _mm_set1_epi8(1);
        const __m128i ext = _mm_set1_epi32(exteriorIsLess ? ~0 : 0);
        const __m128i out = _mm_and_si128(ext, ones);

        const __m128 t0 = _mm_set_ps(T[3], T[2], T[1], T[0]);
        const __m128 t1 = _mm_set_ps(T[7], T[6], T[5], T[4]);

        dst = writePadding(dst, 2 * scratchSize.y * scratchSize.x * z_pad_start, out);
        for (uint32_t k = 0; k < z_process; k++) {
            dst = writePadding(dst, 2 * scratchSize.x * y_pad_start, out);
            for (uint32_t j = 0; j < y_process; j++) {
                dst = writePadding(dst, 2 * x_pad_start, out);

                const auto * src = field + static_cast<int32_t>(((k + z_in_skip) * fieldDim.y + j + y_in_skip) * fieldDim.x) + x_in_skip;
                for (uint32_t i = 0; i < (x_process >> 1); i++) {
                    __m128 v = fetch2(src); src += 2;

                    __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
                    __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v0, t0));
                    __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(v0, t1));
                    __m128i mm0 = _mm_packs_epi32(m0, m1);

                    __m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
                    __m128i m2 = _mm_castps_si128(_mm_cmplt_ps(v1, t0));
                    __m128i m3 = _mm_castps_si128(_mm_cmplt_ps(v1, t1));
                    __m128i mm1 = _mm_packs_epi32(m2, m3);

                    __m128i m = _mm_packs_epi16(mm0, mm1);
                    __m128i b = _mm_and_si128(m, ones);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), b);
                    dst += 4;
                }
                if (x_process & 1) {
                    __m128 v = _mm_set1_ps(*src++);
                    __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v, t0));
                    __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(v, t1));
                    __m128i mm0 = _mm_packs_epi32(m0, m1);

                    __m128i m = _mm_packs_epi16(mm0, _mm_setzero_si128());
                    __m128i b = _mm_and_si128(m, ones);

                    _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), b);
                    dst += 2;
                }
                dst = writePadding(dst, 2 * x_pad_stop, out);
            }
            dst = writePadding(dst, 2 * scratchSize.x * y_pad_stop, out);
        }
        dst = writePadding(dst, 2 * scratchSize.y * scratchSize.x * z_pad_stop, out);
    }

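    // Four-surface variant of setInitialBit8: each scratch sample holds four bytes, one
    // per threshold in T[0..3].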
    template<typename Type>
    void setInitialBit4(uint32_t* s,
                        const uvec3& fieldDim,
                        const uvec3& tileSize,
                        const uvec3& rA,
                        const ivec3& gridA,    // Might be negative.
                        const Type* T,
                        const Type* field,
                        const bool exteriorIsLess)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);

        uint32_t x_pad_start, x_in_skip, x_process, x_pad_stop;
        determineRunlengths(x_pad_start, x_in_skip, x_process, x_pad_stop,
                            rA.x + gridA.x, scratchSize.x, fieldDim.x);

        uint32_t y_pad_start, y_in_skip, y_process, y_pad_stop;
        determineRunlengths(y_pad_start, y_in_skip, y_process, y_pad_stop,
                            rA.y + gridA.y, scratchSize.y, fieldDim.y);

        uint32_t z_pad_start, z_in_skip, z_process, z_pad_stop;
        determineRunlengths(z_pad_start, z_in_skip, z_process, z_pad_stop,
                            rA.z + gridA.z, scratchSize.z, fieldDim.z);

        uint32_t * dst = s;

        const __m128i ones = _mm_set1_epi8(1);
        const __m128i ext = _mm_set1_epi32(exteriorIsLess ? ~0 : 0);
        const __m128i out = _mm_and_si128(ext, ones);

        const __m128 t0 = _mm_set_ps(T[3], T[2], T[1], T[0]);

        dst = writePadding(dst, scratchSize.y * scratchSize.x * z_pad_start, out);
        for (uint32_t k = 0; k < z_process; k++) {
            dst = writePadding(dst, scratchSize.x * y_pad_start, out);
            for (uint32_t j = 0; j < y_process; j++) {
                dst = writePadding(dst, x_pad_start, out);

                const auto * src = field + static_cast<int32_t>(((k + z_in_skip) * fieldDim.y + j + y_in_skip) * fieldDim.x) + x_in_skip;
                for (uint32_t i = 0; i < (x_process >> 2); i++) {
                    __m128 v = fetch4(src); src += 4;

                    __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), t0));
                    __m128i m1 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), t0));
                    __m128i mm0 = _mm_packs_epi32(m0, m1);

                    __m128i m2 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), t0));
                    __m128i m3 = _mm_castps_si128(_mm_cmplt_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), t0));
                    __m128i mm1 = _mm_packs_epi32(m2, m3);

                    __m128i m = _mm_packs_epi16(mm0, mm1);
                    __m128i b = _mm_and_si128(m, ones);
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), b);
                    dst += 4;
                }
                for (uint32_t i = 0; i < (x_process & 3); i++) {
                    __m128 v = _mm_set1_ps(*src++);
                    __m128i m0 = _mm_castps_si128(_mm_cmplt_ps(v, t0));
                    __m128i mm0 = _mm_packs_epi32(m0, _mm_setzero_si128());
                    __m128i m = _mm_packs_epi16(mm0, _mm_setzero_si128());
                    __m128i b = _mm_and_si128(m, ones);
                    *dst++ = static_cast<uint32_t>(_mm_cvtsi128_si32(b));
                }
                dst = writePadding(dst, x_pad_stop, out);
            }
            dst = writePadding(dst, scratchSize.x * y_pad_stop, out);
        }
        dst = writePadding(dst, scratchSize.y * scratchSize.x * z_pad_stop, out);
    }

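    // ORs each element with its +x neighbour's bits shifted up by one, turning the
    // per-corner bits into partial cell codes along x. inOutShift compensates for the
    // offset at which setInitialBit wrote its output, so the result is compacted to the
    // start of the scratch buffer.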
    void xMerge(uint32_t* s, const uint32_t uint32sInElement, const uint32_t elements, const uint32_t inOutShift)
    {
        const uint32_t* q = s + uint32sInElement;
        uint32_t i = 0;
        for (; i + 8 < uint32sInElement * elements; i += 8) {
            __m256i a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + inOutShift));
            __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(q + i + inOutShift));
            __m256i c = _mm256_or_si256(a, _mm256_slli_epi32(b, 1));
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(s + i), c);
        }
        for (; i < uint32sInElement * elements; i++) {
            s[i] = s[i + inOutShift] | (q[i + inOutShift] << 1);
        }
    }

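    // ORs each row with the +y neighbour row shifted up by two bits, extending the cell
    // codes with the corners of the next row.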
    void yMerge(uint32_t* s, const uint32_t uint32sInARow, const uint32_t rows)
    {
        const uint32_t* q = s + uint32sInARow;
        uint32_t i = 0;
        for (; i + 8 < uint32sInARow * rows; i += 8) {
            __m256i a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));
            __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(q + i));
            __m256i c = _mm256_or_si256(a, _mm256_slli_epi32(b, 2));
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(s + i), c);
        }
        for (; i < uint32sInARow * rows; i++) {
            s[i] = s[i] | (q[i] << 2);
        }
    }

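    // ORs each slice with the +z neighbour slice shifted up by four bits, completing the
    // eight-corner case code in each byte. Bytes that saturate to 0xff (all corners
    // below the threshold) belong to cells the surface does not cross, so they are
    // cleared to mark those cells inactive.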
    void zMerge(uint32_t* s, const uint32_t uint32sInASlice, const uint32_t slices)
    {
        const uint32_t* q = s + uint32sInASlice;
        uint32_t i = 0;

        __m256i zero256 = _mm256_setzero_si256();
        __m256i ones256 = _mm256_cmpeq_epi32(zero256, zero256);
        for (; i + 8 < uint32sInASlice * slices; i += 8) {
            __m256i a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));
            __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(q + i));
            __m256i c = _mm256_or_si256(a, _mm256_slli_epi32(b, 4));
            __m256i m = _mm256_cmpeq_epi8(c, ones256);
            c = _mm256_andnot_si256(m, c);
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(s + i), c);
        }
        for (; i < uint32sInASlice * slices; i++) {
            __m128i zero128 = _mm_setzero_si128();
            __m128i ones128 = _mm_cmpeq_epi32(zero128, zero128);
            __m128i a = _mm_cvtsi32_si128(static_cast<int32_t>(s[i]));
            __m128i b = _mm_cvtsi32_si128(static_cast<int32_t>(q[i]));
            __m128i c = _mm_or_si128(a, _mm_slli_epi32(b, 4));
            __m128i m = _mm_cmpeq_epi8(c, ones128);
            c = _mm_andnot_si128(m, c);
            s[i] = static_cast<uint32_t>(_mm_cvtsi128_si32(c));
        }
    }

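    // Counts, per threshold lane, the number of cells in the clamped tile whose case
    // code is non-zero. 16-lane variant: each cell occupies 16 bytes of scratch.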
    void countActiveCells16(uint32_t* Nc,
                            const uint32_t* s,
                            const uvec3& tileSizeClamped,
                            const uvec3& tileSize)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);
        __m128i Nc_[4] = {
            _mm_setzero_si128(),
            _mm_setzero_si128(),
            _mm_setzero_si128(),
            _mm_setzero_si128()
        };
        __m128i ones4 = _mm_set1_epi32(1);
        __m128i zero = _mm_setzero_si128();
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + 4 * (k * scratchSize.y + j) * scratchSize.x;
                for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                    __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + i);
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);
                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                    __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                    __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                    Nc_[0] = _mm_add_epi32(Nc_[0], _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                    Nc_[1] = _mm_add_epi32(Nc_[1], _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));
                    Nc_[2] = _mm_add_epi32(Nc_[2], _mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4));
                    Nc_[3] = _mm_add_epi32(Nc_[3], _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));
                }
            }
        }
        // Write the per-lane counts for lanes 0-15; storing the vectors avoids
        // MSVC-specific __m128i member access.
        for (uint32_t l = 0; l < 4; l++) {
            _mm_storeu_si128(reinterpret_cast<__m128i*>(Nc + 4 * l), Nc_[l]);
        }
    }

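    // Counts, per threshold lane, the number of cells whose case code is non-zero.
    // Eight-lane variant: each cell occupies 8 bytes of scratch, so two cells are
    // handled per 16-byte load.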
    void countActiveCells8(uint32_t* Nc,
                           const uint32_t* s,
                           const uvec3& tileSizeClamped,
                           const uvec3& tileSize)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);
        __m128i Nc_[2] = {
            _mm_setzero_si128(),
            _mm_setzero_si128()
        };
        __m128i ones4 = _mm_set1_epi32(1);
        __m128i zero = _mm_setzero_si128();
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + 2 * (k * scratchSize.y + j) * scratchSize.x;

                uint32_t i = 0;
                for (; i + 1 < tileSizeClamped.x; i += 2) {
                    __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 2 * i));
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);

                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                    __m128i sum03 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4),
                                                  _mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4));
                    Nc_[0] = _mm_add_epi32(Nc_[0], sum03);

                    __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                    __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                    __m128i sum47 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4),
                                                  _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));
                    Nc_[1] = _mm_add_epi32(Nc_[1], sum47);
                }
                for (; i < tileSizeClamped.x; i++) {
                    __m128i code16_0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src + 2 * i));
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                    Nc_[0] = _mm_add_epi32(Nc_[0], _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                    Nc_[1] = _mm_add_epi32(Nc_[1], _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));
                }
            }
        }
        // Write the per-lane counts for lanes 0-7.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(Nc), Nc_[0]);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(Nc + 4), Nc_[1]);
    }

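    // Counts, per threshold lane, the number of cells whose case code is non-zero.
    // Four-lane variant: each cell occupies 4 bytes of scratch, so four cells are
    // handled per 16-byte load.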
    void countActiveCells4(uint32_t* Nc,
                           const uint32_t* s,
                           const uvec3& tileSizeClamped,
                           const uvec3& tileSize)
    {
        const uvec3 scratchSize = tileSize + uvec3(1);
        __m128i Nc_ = _mm_setzero_si128();

        __m128i ones4 = _mm_set1_epi32(1);
        __m128i zero = _mm_setzero_si128();

        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + (k * scratchSize.y + j) * scratchSize.x;

                uint32_t i = 0;
                for (; i + 4 < tileSizeClamped.x; i += 4) {
                    __m128i code16_0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    __m128i code4_1 = _mm_unpackhi_epi16(code8_0, zero);
                    __m128i sum0 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4),
                                                 _mm_andnot_si128(_mm_cmpeq_epi32(code4_1, zero), ones4));

                    __m128i code8_1 = _mm_unpackhi_epi8(code16_0, zero);
                    __m128i code4_2 = _mm_unpacklo_epi16(code8_1, zero);
                    __m128i code4_3 = _mm_unpackhi_epi16(code8_1, zero);
                    __m128i sum1 = _mm_add_epi32(_mm_andnot_si128(_mm_cmpeq_epi32(code4_2, zero), ones4),
                                                 _mm_andnot_si128(_mm_cmpeq_epi32(code4_3, zero), ones4));

                    Nc_ = _mm_add_epi32(Nc_, _mm_add_epi32(sum0, sum1));
                }
                for (; i < tileSizeClamped.x; i++) {
                    __m128i code16_0 = _mm_cvtsi32_si128(static_cast<int32_t>(src[i]));
                    __m128i code8_0 = _mm_unpacklo_epi8(code16_0, zero);
                    __m128i code4_0 = _mm_unpacklo_epi16(code8_0, zero);
                    Nc_ = _mm_add_epi32(Nc_, _mm_andnot_si128(_mm_cmpeq_epi32(code4_0, zero), ones4));
                }
            }
        }
        // Write the per-lane counts for lanes 0-3.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(Nc), Nc_);
    }

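    // Compacts the active cells for up to 16 threshold lanes: for every cell with a
    // non-zero case code, writes the compacted index into cellMap, and records the case
    // code and uncompacted index at the per-lane output offset Oc handed out by the
    // caller.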
    void calculateOffsets16(int32_t* cellMap,
                            uint8_t* activeCellCases,
                            int32_t* activeCellIndices,
                            const __m128i* s,
                            const uvec3& tileSizeClamped,
                            const uvec3& tileSize,
                            const uvec3& rA,
                            const uvec3& M,
                            const uint32_t* Oc,
                            const size_t layerStride,
                            const uint32_t tOff,
                            const uint32_t lanes)
    {
        uint32_t Ni[16];
        for (uint32_t l = 0; l < lanes; l++) {
            Ni[l] = 0;
        }

        const uvec3 scratchSize = tileSize + uvec3(1);
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + (k * scratchSize.y + j) * scratchSize.x;
                const auto lineOffset = ((k + rA.z) * M.y + (j + rA.y)) * M.x;

                for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                    // Copy the 16 per-lane case codes of this cell into a byte array;
                    // indexing the bytes this way is portable across compilers.
                    alignas(16) uint8_t codes[16];
                    _mm_storeu_si128(reinterpret_cast<__m128i*>(codes), _mm_loadu_si128(src + i));

                    for (uint32_t l = 0; l < lanes; l++) {
                        const auto code = codes[l];
                        if (code == 0) continue;

                        const auto t = tOff + l;
                        const auto uncompactedCellIndex = lineOffset + i + rA.x;
                        const auto c = Oc[l] + (Ni[l]++);
                        cellMap[layerStride * t + uncompactedCellIndex] = c;
                        activeCellCases[layerStride * t + c] = code;
                        activeCellIndices[layerStride * t + c] = uncompactedCellIndex;
                    }
                }
            }
        }
    }

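    // Eight-lane variant of calculateOffsets16; the eight case codes of a cell are read
    // from a single uint64.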
    void calculateOffsets8(int32_t* cellMap,
                           uint8_t* activeCellCases,
                           int32_t* activeCellIndices,
                           const uint64_t* s,
                           const uvec3& tileSizeClamped,
                           const uvec3& tileSize,
                           const uvec3& rA,
                           const uvec3& M,
                           const uint32_t* Oc,
                           const size_t layerStride,
                           const uint32_t tOff,
                           const uint32_t lanes)
    {
        uint32_t Ni[8];
        for (uint32_t l = 0; l < lanes; l++) {
            Ni[l] = 0;
        }

        const uvec3 scratchSize = tileSize + uvec3(1);
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + (k * scratchSize.y + j) * scratchSize.x;
                const auto lineOffset = ((k + rA.z) * M.y + (j + rA.y)) * M.x;

                for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                    union
                    {
                        uint64_t u64;
                        uint8_t u8[8];
                    } codes;
                    codes.u64 = src[i];

                    for (uint32_t l = 0; l < lanes; l++) {
                        const auto code = codes.u8[l];
                        if (code == 0) continue;

                        const auto t = tOff + l;
                        const auto uncompactedCellIndex = lineOffset + i + rA.x;
                        const auto c = Oc[l] + (Ni[l]++);
                        cellMap[layerStride * t + uncompactedCellIndex] = c;
                        activeCellCases[layerStride * t + c] = code;
                        activeCellIndices[layerStride * t + c] = uncompactedCellIndex;
                    }
                }
            }
        }
    }

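    // Four-lane variant of calculateOffsets16; the four case codes of a cell are read
    // from a single uint32.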
    void calculateOffsets4(int32_t* cellMap,
                           uint8_t* activeCellCases,
                           int32_t* activeCellIndices,
                           const uint32_t* s,
                           const uvec3& tileSizeClamped,
                           const uvec3& tileSize,
                           const uvec3& rA,
                           const uvec3& M,
                           const uint32_t* Oc,
                           const size_t layerStride,
                           const uint32_t tOff,
                           const uint32_t lanes)
    {
        assert(lanes <= 4);
        uint32_t Ni[4];
        for (uint32_t l = 0; l < lanes; l++) {
            Ni[l] = 0;
        }

        const uvec3 scratchSize = tileSize + uvec3(1);
        for (uint32_t k = 0; k < tileSizeClamped.z; k++) {
            for (uint32_t j = 0; j < tileSizeClamped.y; j++) {
                const auto * src = s + (k * scratchSize.y + j) * scratchSize.x;
                const auto lineOffset = ((k + rA.z) * M.y + (j + rA.y)) * M.x;

                for (uint32_t i = 0; i < tileSizeClamped.x; i++) {
                    union
                    {
                        uint32_t u32;
                        uint8_t u8[4];
                    } codes;
                    codes.u32 = src[i];

                    for (uint32_t l = 0; l < lanes; l++) {
                        const auto code = codes.u8[l];
                        if (code == 0) continue;

                        const auto t = tOff + l;
                        const auto uncompactedCellIndex = lineOffset + i + rA.x;
                        const auto c = Oc[l] + (Ni[l]++);
                        cellMap[layerStride * t + uncompactedCellIndex] = c;
                        activeCellCases[layerStride * t + c] = code;
                        activeCellIndices[layerStride * t + c] = uncompactedCellIndex;
                    }
                }
            }
        }
    }

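    // Combines the per-corner bits along x, y and z into the per-cell case codes,
    // compacting the scratch buffer from the write offset uint32XShift down to its start.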
    void merge(uint32_t* s,
               const uint32_t uint32XShift,
               const uint32_t uint32sPerElement,
               const uint32_t tileX,
               const uint32_t tileY,
               const uint32_t tileZ)
    {
        xMerge(s, uint32sPerElement, (tileX + 1) * (tileY + 1) * (tileZ + 1) - 1, uint32XShift);
        yMerge(s, uint32sPerElement * (tileX + 1), (tileY + 1) * (tileZ + 1) - 1);
        zMerge(s, uint32sPerElement * (tileX + 1) * (tileY + 1), tileZ);
    }

}

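// Analyzes one tile of a float32 field: classifies every sample against up to Nt
// iso-values (eight lanes per pass while more than four remain, then four), builds the
// per-cell case codes, counts active cells per iso-surface, reserves output ranges via
// the shared atomic cell offsets, and writes the compacted cell map, case codes and
// indices for the tile.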
void Cogs::Core::IsoSurfaces::analyzeTile_f32_AVX2(AnalyzeGlobalState* g, const glm::ivec3 id)
{
    CpuInstrumentationScope(SCOPE_ISOSURFACES, "analyzeTile_f32_AVX2");
    auto timer = Timer::startNew();

    const float* field = (const float*)g->field;
    const float* thresholds = (const float*)g->thresholds;

    const uvec3 fieldDim = uvec3(g->fieldDim);
    const uvec3 tileSize = uvec3(g->tileSize);
    const uvec3 scratchSize = tileSize + uvec3(1);
    const uvec3 M = uvec3(g->M);
    const auto exteriorIsLess = g->exteriorIsLess;
    uvec3 rA = tileSize * uvec3(id);
    uvec3 rB = glm::min(M, rA + tileSize);
    const auto tileSizeClamped = glm::min(tileSize, rB - rA);
    const size_t layerStride = g->M.x * g->M.y * g->M.z;
    const auto Nt = static_cast<uint32_t>(g->Nt);

    float T[16];
    uint32_t Nc[16];
    uint32_t Oc[16];
    auto * scratch = g->scratchAcquire(4 * sizeof(int) * (scratchSize.x * scratchSize.y * scratchSize.z + 1 + 4));
    auto * s = reinterpret_cast<uint32_t*>(scratch->data());

    uint32_t tOff = 0;

    // While there are 5 or more surfaces, do 8 at a time.
    for (; tOff + 4 < Nt; tOff += 8) {
        auto lanes = min(8u, Nt - tOff);
        for (uint32_t i = 0; i < 8; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit8(reinterpret_cast<uint64_t*>(s) + 8, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 2, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells8(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets8(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          reinterpret_cast<uint64_t*>(s),
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    // And do the rest 4 at a time.
    for (; tOff < Nt; tOff += 4) {
        auto lanes = min(4u, Nt - tOff);
        for (uint32_t i = 0; i < 4; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit4(s + 16, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 1, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells4(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets4(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          s,
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    g->scratchRelease(scratch);

    if (g->elapsed_us != nullptr) {
        g->elapsed_us->fetch_add(timer.elapsedMicroseconds());
    }
    _mm256_zeroupper();
}

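// uint16 variant of analyzeTile_f32_AVX2: identical flow, but samples and thresholds are
// 16-bit unsigned integers that are widened to float before comparison.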
void Cogs::Core::IsoSurfaces::analyzeTile_u16_AVX2(AnalyzeGlobalState* g, const glm::ivec3 id)
{
    CpuInstrumentationScope(SCOPE_ISOSURFACES, "analyzeTile_u16_AVX2");
    auto timer = Timer::startNew();

    const uint16_t* field = (const uint16_t*)g->field;
    const uint16_t* thresholds = (const uint16_t*)g->thresholds;

    const uvec3 fieldDim = uvec3(g->fieldDim);
    const uvec3 tileSize = uvec3(g->tileSize);
    const uvec3 scratchSize = tileSize + uvec3(1);
    const uvec3 M = uvec3(g->M);
    const auto exteriorIsLess = g->exteriorIsLess;
    uvec3 rA = tileSize * uvec3(id);
    uvec3 rB = glm::min(M, rA + tileSize);
    const auto tileSizeClamped = glm::min(tileSize, rB - rA);
    const size_t layerStride = g->M.x * g->M.y * g->M.z;
    const auto Nt = static_cast<uint32_t>(g->Nt);

    uint16_t T[16];
    uint32_t Nc[16];
    uint32_t Oc[16];
    auto * scratch = g->scratchAcquire(4 * sizeof(int) * (scratchSize.x * scratchSize.y * scratchSize.z + 1 + 4));
    auto * s = reinterpret_cast<uint32_t*>(scratch->data());

    uint32_t tOff = 0;

    // While there are 5 or more surfaces, do 8 at a time.
    for (; tOff + 4 < Nt; tOff += 8) {
        auto lanes = min(8u, Nt - tOff);
        for (uint32_t i = 0; i < 8; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit8(reinterpret_cast<uint64_t*>(s) + 8, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 2, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells8(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets8(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          reinterpret_cast<uint64_t*>(s),
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    // And do the rest 4 at a time.
    for (; tOff < Nt; tOff += 4) {
        auto lanes = min(4u, Nt - tOff);
        for (uint32_t i = 0; i < 4; i++) {
            T[i] = thresholds[tOff + min(i, lanes - 1)];
        }
        setInitialBit4(s + 16, fieldDim, tileSize, rA, g->gridA, T, field, exteriorIsLess);
        merge(s, 16, 1, tileSize.x, tileSize.y, tileSize.z);
        countActiveCells4(Nc, s, tileSizeClamped, tileSize);
        for (uint32_t l = 0; l < lanes; l++) {
            Oc[l] = g->cellOffsets[l].fetch_add(Nc[l]);
        }
        calculateOffsets4(g->cellMap, g->activeCellCases, g->activeCellIndices,
                          s,
                          tileSizeClamped, tileSize, rA, M,
                          Oc, layerStride, tOff, lanes);
    }

    g->scratchRelease(scratch);

    if (g->elapsed_us != nullptr) {
        g->elapsed_us->fetch_add(timer.elapsedMicroseconds());
    }
    _mm256_zeroupper();
}

#endif // !defined(EMSCRIPTEN) && !defined(__APPLE__)