// Cogs.Core / UniformGridSystem_sample_avx2.cpp

#include <glm/glm.hpp>
#include <glm/gtc/quaternion.hpp>

#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

#include <immintrin.h>

//#include "C:\utils\iaca-win64\iacaMarks.h"

namespace {

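    // Rotates eight vectors (SoA layout: v_x/v_y/v_z) by the quaternion q,
    // using the same identity as glm::operator*(quat, vec3):
    //   uv  = cross(q.xyz, v)
    //   uuv = cross(q.xyz, uv)
    //   v'  = v + 2*(q.w*uv + uuv)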
    __forceinline void quat_times_vec3_ps(__m256& out_x, __m256& out_y, __m256& out_z,
                                          const glm::quat& q,
                                          const __m256& v_x, const __m256& v_y, const __m256& v_z)
    {
        __m256 q_x = _mm256_set1_ps(q.x);
        __m256 q_y = _mm256_set1_ps(q.y);
        __m256 q_z = _mm256_set1_ps(q.z);
        __m256 q_w = _mm256_set1_ps(q.w);
        // uv = cross(q.xyz, v)
        __m256 uv_x = _mm256_fmsub_ps(q_y, v_z, _mm256_mul_ps(v_y, q_z));
        __m256 uv_y = _mm256_fmsub_ps(q_z, v_x, _mm256_mul_ps(v_z, q_x));
        __m256 uv_z = _mm256_fmsub_ps(q_x, v_y, _mm256_mul_ps(v_x, q_y));
        // uuv = cross(q.xyz, uv)
        __m256 uuv_x = _mm256_fmsub_ps(q_y, uv_z, _mm256_mul_ps(uv_y, q_z));
        __m256 uuv_y = _mm256_fmsub_ps(q_z, uv_x, _mm256_mul_ps(uv_z, q_x));
        __m256 uuv_z = _mm256_fmsub_ps(q_x, uv_y, _mm256_mul_ps(uv_x, q_y));
        // t = q.w*uv + uuv
        __m256 t_x = _mm256_fmadd_ps(q_w, uv_x, uuv_x);
        __m256 t_y = _mm256_fmadd_ps(q_w, uv_y, uuv_y);
        __m256 t_z = _mm256_fmadd_ps(q_w, uv_z, uuv_z);
        // out = v + 2*t
        out_x = _mm256_add_ps(v_x, _mm256_add_ps(t_x, t_x));
        out_y = _mm256_add_ps(v_y, _mm256_add_ps(t_y, t_y));
        out_z = _mm256_add_ps(v_z, _mm256_add_ps(t_z, t_z));
    }

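    // atan approximation for x in [-1, 1]:
    //   atan(x) ~= (pi/4)*x - x*(|x| - 1)*(0.2447 + 0.0663*|x|)
    // The maximum absolute error is about 0.00155 rad, hence the name.
    // Exercised by the disabled UnitTests block below.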
    __forceinline __m256 atan_00155_ps(__m256 x)
    {
        const auto c0 = _mm256_set1_ps(float(3.14159265358979323846264338327950288 / 4.0));
        const auto c1 = _mm256_set1_ps(0.2447f);
        const auto c2 = _mm256_set1_ps(0.0663f);
        const auto c3 = _mm256_set1_ps(1.f);
        __m256 sign = _mm256_set1_ps(-0.f);
        __m256 abs_x = _mm256_andnot_ps(sign, x);
        __m256 t1 = _mm256_sub_ps(abs_x, c3);        // t1 = |x| - 1
        __m256 t3 = _mm256_fmadd_ps(abs_x, c2, c1);  // t3 = |x|*c2 + c1
        __m256 t2 = _mm256_mul_ps(t1, t3);
        __m256 t4 = _mm256_mul_ps(x, t2);            // t4 = x*t1*t3
        __m256 t5 = _mm256_fmsub_ps(c0, x, t4);      // t5 = c0*x - t4
        return t5;
    }

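    // asin approximation, Abramowitz & Stegun style (eq. 4.4.45):
    //   asin(x) ~= pi/2 - sqrt(1 - x)*(C0 + C1*x + C2*x^2 + C3*x^3),  0 <= x <= 1
    // extended to [-1, 1] by copying the sign of x onto the result.
    // sqrt(1 - |x|) is computed as (1 - |x|)*rsqrt(1 - |x|); rsqrt(0) returns
    // +inf, which the infinity mask below zeroes out so that asin(+/-1)
    // evaluates to exactly +/-pi/2 instead of NaN.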
    __forceinline /*__declspec(noinline)*/ __m256 asin_ps(__m256 x)
    {
        const auto C0_ps = _mm256_set1_ps(1.5707288f);
        const auto C1_ps = _mm256_set1_ps(-0.2121144f);
        const auto C2_ps = _mm256_set1_ps(0.0742610f);
        const auto C3_ps = _mm256_set1_ps(-0.0187293f);
        const auto sign_ps = _mm256_set1_ps(-0.f);
        const auto one_ps = _mm256_set1_ps(1.f);
        const auto halfpi_ps = _mm256_set1_ps(1.5707963267948966f);
        const auto special = _mm256_set1_ps(std::numeric_limits<float>::infinity()); // = 0x7F800000 (mask for the exponent field)

        __m256 abs_x = _mm256_andnot_ps(sign_ps, x);

        __m256 a_ = _mm256_sub_ps(one_ps, abs_x);
#if 0
        __m256 a = _mm256_sqrt_ps(a_);
#else
        __m256 t = _mm256_rsqrt_ps(a_);
        __m256 m = _mm256_cmp_ps(special, t, _CMP_NEQ_OQ); // mask for non-infinity lanes
        __m256 a = _mm256_mul_ps(a_, _mm256_and_ps(m, t)); // a = a_ * rsqrt(a_) = sqrt(a_)
#endif

        // Evaluate C0 + C1*|x| + C2*|x|^2 + C3*|x|^3 in Horner form.
        __m256 b = C3_ps;
        b = _mm256_fmadd_ps(b, abs_x, C2_ps);
        b = _mm256_fmadd_ps(b, abs_x, C1_ps);
        b = _mm256_fmadd_ps(b, abs_x, C0_ps);

        __m256 rv = _mm256_fmsub_ps(a, b, halfpi_ps);     // a*b - pi/2
        rv = _mm256_andnot_ps(sign_ps, rv);               // = pi/2 - a*b, since a*b <= pi/2
        rv = _mm256_or_ps(rv, _mm256_and_ps(x, sign_ps)); // copy the sign of x
        return rv;
    }

#if 0 // A static unit test is no good on systems that lack AVX2 support, but the extensions framework has no proper unit-test support.
#pragma optimize( "", off )
    static struct UnitTests
    {
        UnitTests()
        {
            const unsigned N = 1000;

            for (unsigned i = 0; i < N; i++) {
                __m256 x, y;
                x = _mm256_set1_ps((2.f / (N - 1))*i - 1.f);
                y = atan_00155_ps(x);
                auto e = std::abs(std::atan(x.m256_f32[7]) - y.m256_f32[7]);
                assert(e < 0.00155f);
            }

            for (unsigned i = 0; i < N; i++) {
                __m256 x, y;
                x = _mm256_set1_ps((2.f / (N - 1))*i - 1.f);
                y = asin_ps(x);
                auto e = std::abs(std::asin(x.m256_f32[7]) - y.m256_f32[7]);
                assert(e < 7e-4f);
            }
        }
    } unitTests;
#pragma optimize( "", on )
#endif

}

namespace Cogs::Core::EchoSounder {

    // Defined in UniformGridSystem.cpp; scalar reference used by the
    // #if 0 verification blocks below.
    glm::vec3 getSamplePosRef(const glm::uvec3 dataSize,
                              const glm::vec3 tp,
                              const glm::vec3 scale,
                              const glm::vec3 arrayPositionGlobal,
                              const glm::vec4* frustum,
                              const float minDistanceSquared,
                              const float maxDistanceSquared,
                              const glm::quat inverseOrientation,
                              const uint32_t coordSys,
                              const glm::vec3 polarScale,
                              const glm::vec3 polarShift,
                              const uint32_t x,
                              const uint32_t y,
                              const uint32_t z);

//#pragma optimize("", off)
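    // Resamples a polar beam volume (v) into a uniform grid tile (data),
    // eight x-voxels at a time:
    //   1. compute the voxel position q relative to the echo-sounder array,
    //   2. test q against the four frustum planes and the squared distance
    //      bounds, skipping a whole group when no lane passes,
    //   3. rotate q into the array frame (inverseOrientation) and map it to
    //      beam coordinates via atan/asin of the direction plus the axial
    //      component az,
    //   4. gather the four neighbouring beam samples and interpolate,
    //   5. blend 50/50 with the existing grid value and track min/max.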
    void sampleTile3_border_avx2(float * data,
                                 float& minVal,
                                 float& maxVal,
                                 const float *v,
                                 const glm::vec3 /*tileIndex*/,
                                 const glm::uvec3 /*tilePos*/,
                                 const glm::uvec3 dataSize,
                                 const glm::uvec3 maxIndices,
                                 const glm::vec3 tp,
                                 const glm::vec3 scale,
                                 const glm::vec3 arrayPositionGlobal,
                                 const glm::vec4* frustum,
                                 const float minDistanceSquared,
                                 const float maxDistanceSquared,
                                 const glm::quat inverseOrientation,
                                 const uint32_t coordSys,
                                 const uint32_t minorCount,
                                 const uint32_t sampleCount,
                                 const glm::vec3 polarScale,
                                 const glm::vec3 polarShift)
    {
        assert((dataSize.x & 7) == 0);
        assert(coordSys == 1);

        static const __m256 c01234567 = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
        glm::vec3 ban = arrayPositionGlobal - tp; // array position relative to the tile origin

        glm::uvec3 maxIndicesLL = glm::max(maxIndices, glm::uvec3(1u)) - glm::uvec3(1);

        __m256 minValue = _mm256_set1_ps(std::numeric_limits<float>::max());
        __m256 maxValue = _mm256_set1_ps(-std::numeric_limits<float>::max());
        for (uint32_t z = 0; z < dataSize.z; z++) {
            __m128 pz = _mm_mul_ss(_mm_set1_ps(scale.z), _mm_set1_ps(static_cast<float>(z)));
            __m128 qz = _mm_sub_ss(pz, _mm_set1_ps(ban.z));
            for (uint32_t y = 0; y < dataSize.y; y++) {
                __m128 py = _mm_mul_ss(_mm_set1_ps(scale.y), _mm_set1_ps(static_cast<float>(y)));
                __m128 qy = _mm_sub_ss(py, _mm_set1_ps(ban.y));
                __m128 r2_yz_ = _mm_add_ss(_mm_mul_ss(qy, qy), _mm_mul_ss(qz, qz));
                // y/z part of the frustum plane dot products, constant over the x-run
                __m128 in0_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[0].z), qz));
                __m128 in1_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[1].z), qz));
                __m128 in2_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[2].z), qz));
                __m128 in3_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[3].z), qz));

                __m256 in0_dot_yz = _mm256_broadcastss_ps(in0_dot_yz_);
                __m256 in1_dot_yz = _mm256_broadcastss_ps(in1_dot_yz_);
                __m256 in2_dot_yz = _mm256_broadcastss_ps(in2_dot_yz_);
                __m256 in3_dot_yz = _mm256_broadcastss_ps(in3_dot_yz_);
                __m256 r2_yz = _mm256_broadcastss_ps(r2_yz_);

                for (uint32_t x = 0; x < dataSize.x; x += 8) {
                    //IACA_VC64_START;

                    __m256 i = _mm256_add_ps(_mm256_set1_ps(static_cast<float>(x)), c01234567);
                    __m256 qx = _mm256_fmsub_ps(_mm256_set1_ps(scale.x), i, _mm256_set1_ps(ban.x));
                    __m256 r2 = _mm256_fmadd_ps(qx, qx, r2_yz);

                    // compare q against the frustum planes and the distance bounds
                    __m256 mask0 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_fmadd_ps(_mm256_set1_ps(frustum[0].x), qx, in0_dot_yz), _CMP_LE_OQ);
                    __m256 mask1 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_fmadd_ps(_mm256_set1_ps(frustum[1].x), qx, in1_dot_yz), _CMP_LE_OQ);
                    __m256 mask2 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_fmadd_ps(_mm256_set1_ps(frustum[2].x), qx, in2_dot_yz), _CMP_LE_OQ);
                    __m256 mask3 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_fmadd_ps(_mm256_set1_ps(frustum[3].x), qx, in3_dot_yz), _CMP_LE_OQ);
                    __m256 mask4 = _mm256_cmp_ps(_mm256_set1_ps(minDistanceSquared), r2, _CMP_LE_OQ);
                    __m256 mask5 = _mm256_cmp_ps(r2, _mm256_set1_ps(maxDistanceSquared), _CMP_LE_OQ);
                    __m256 mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask0, mask1),
                                                              _mm256_and_ps(mask2, mask3)),
                                                _mm256_and_ps(mask4, mask5));
                    int movemask = _mm256_movemask_ps(mask);
                    if (movemask == 0) {
                        // No lane passes: keep the old data, but still track min/max.
#if 0
                        for (unsigned lane = 0; lane < 8; lane++) {
                            glm::vec3 ref = getSamplePosRef(dataSize, tp, scale, arrayPositionGlobal, frustum, minDistanceSquared, maxDistanceSquared,
                                                            inverseOrientation, coordSys, polarScale, polarShift,
                                                            x + lane, y, z);
                            assert(ref.x == -1);
                        }
#endif
                        __m256 old = _mm256_load_ps(data);
                        minValue = _mm256_min_ps(minValue, old);
                        maxValue = _mm256_max_ps(maxValue, old);
                        data += 8;
                        continue;
                    }

                    // Rotate q into the array frame and convert to beam coordinates.
                    __m256 ax, ay, az;
                    quat_times_vec3_ps(ax, ay, az,
                                       inverseOrientation,
                                       qx, _mm256_broadcastss_ps(qy), _mm256_broadcastss_ps(qz));

                    __m256 r_inv = _mm256_rsqrt_ps(r2);
                    //__m256 r = _mm256_rcp_ps(r_inv);
                    __m256 r = _mm256_mul_ps(r2, r_inv); // r = r2*rsqrt(r2) = sqrt(r2); only needed by the range-based xi_k variant below

                    __m256 dirx = asin_ps(_mm256_mul_ps(ax, r_inv));
                    __m256 diry = atan_00155_ps(_mm256_mul_ps(ay, _mm256_rcp_ps(az)));
                    __m256 xi_i = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.x), _mm256_sub_ps(diry, _mm256_set1_ps(polarShift.x))));
                    __m256 xi_j = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.y), _mm256_sub_ps(dirx, _mm256_set1_ps(polarShift.y))));
                    __m256 xi_k = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.z), _mm256_sub_ps(az, _mm256_set1_ps(polarShift.z))));
                    //__m256 xi_k = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.z), _mm256_sub_ps(r, _mm256_set1_ps(polarShift.z))));

#if 0
                    for (unsigned lane = 0; lane < 8; lane++) {
                        if (mask.m256_f32[lane] == 0.0) continue;
                        glm::vec3 ref = getSamplePosRef(dataSize, tp, scale, arrayPositionGlobal, frustum, minDistanceSquared, maxDistanceSquared,
                                                        inverseOrientation, coordSys, polarScale, polarShift,
                                                        x + lane, y, z);
                        if (ref.x < 0) continue;

                        auto ex = std::abs(ref.x - xi_i.m256_f32[lane]);
                        auto ey = std::abs(ref.y - xi_j.m256_f32[lane]);
                        auto ez = std::abs(ref.z - xi_k.m256_f32[lane]);
                        assert(ex < 1e-2f);
                        assert(ey < 1e-2f);
                        assert(ez < 1e-0f);
                    }
#endif
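                    // Split xi into an integer cell (tau) and a fraction (t),
                    // clamp the cell to the last valid index, then bilinearly
                    // blend the four gathered samples in the (i, j) plane:
                    //   val = mix(mix(val00, val01, t_j), mix(val10, val11, t_j), t_i)
                    // t_k is computed but unused, so the k direction is
                    // effectively sampled at floor(xi_k).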
                    __m256 tau_i = _mm256_floor_ps(xi_i);
                    __m256 tau_j = _mm256_floor_ps(xi_j);
                    __m256 tau_k = _mm256_floor_ps(xi_k);
                    __m256 t_i = _mm256_sub_ps(xi_i, tau_i);
                    __m256 t_j = _mm256_sub_ps(xi_j, tau_j);
                    __m256 t_k = _mm256_sub_ps(xi_k, tau_k);

                    __m256 i_i = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.x)), tau_i);
                    __m256 i_j = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.y)), tau_j);
                    __m256 i_k = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.z)), tau_k);

                    // flat index = sampleCount*(minorCount*i_j + i_i) + i_k
                    __m256 ix00_ = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(static_cast<float>(sampleCount)),
                                                               _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(static_cast<float>(minorCount)), i_j), i_i)), i_k);
                    __m256i ix00 = _mm256_cvtps_epi32(ix00_);

                    __m256i ix01 = _mm256_add_epi32(ix00, _mm256_set1_epi32(minorCount));
                    __m256i ix10 = _mm256_add_epi32(ix00, _mm256_set1_epi32(sampleCount));
                    __m256i ix11 = _mm256_add_epi32(ix00, _mm256_set1_epi32(minorCount + sampleCount));

                    // Masked gathers: inactive lanes carry garbage indices, so
                    // their memory accesses must be suppressed.
                    __m256 val00 = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), v, ix00, mask, 4);
                    __m256 val01 = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), v, ix01, mask, 4);
                    __m256 val10 = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), v, ix10, mask, 4);
                    __m256 val11 = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), v, ix11, mask, 4);

                    __m256 dif0 = _mm256_sub_ps(val01, val00);
                    __m256 val0 = _mm256_fmadd_ps(t_j, dif0, val00);

                    __m256 dif1 = _mm256_sub_ps(val11, val10);
                    __m256 val1 = _mm256_fmadd_ps(t_j, dif1, val10);

                    __m256 dif = _mm256_sub_ps(val1, val0);
                    __m256 val_ = _mm256_fmadd_ps(t_i, dif, val0);

                    __m256 old = _mm256_load_ps(data);

                    // replace inactive lanes with old data
                    val_ = _mm256_blendv_ps(old, val_, mask);

                    // 50-50 mix of old and new
                    val_ = _mm256_fmadd_ps(_mm256_set1_ps(0.5f), old, _mm256_mul_ps(_mm256_set1_ps(0.5f), val_));

                    minValue = _mm256_min_ps(minValue, val_);
                    maxValue = _mm256_max_ps(maxValue, val_);

                    _mm256_store_ps(data, val_);

                    //_mm256_maskstore_ps(data, _mm256_castps_si256(mask), val_);
                    data += 8;

                    //IACA_VC64_END;
                }
            }
        }

        // Horizontal reduction of the per-lane min/max (m256_f32 is MSVC-specific).
        minVal = glm::min(glm::min(glm::min(minValue.m256_f32[0], minValue.m256_f32[1]),
                                   glm::min(minValue.m256_f32[2], minValue.m256_f32[3])),
                          glm::min(glm::min(minValue.m256_f32[4], minValue.m256_f32[5]),
                                   glm::min(minValue.m256_f32[6], minValue.m256_f32[7])));

        maxVal = glm::max(glm::max(glm::max(maxValue.m256_f32[0], maxValue.m256_f32[1]),
                                   glm::max(maxValue.m256_f32[2], maxValue.m256_f32[3])),
                          glm::max(glm::max(maxValue.m256_f32[4], maxValue.m256_f32[5]),
                                   glm::max(maxValue.m256_f32[6], maxValue.m256_f32[7])));
    }

    //#pragma optimize( "", off )
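    // Interior-tile variant of sampleTile3_border_avx2: the same resampling,
    // but with the frustum and distance tests omitted (the distance parameters
    // are commented out in the signature), presumably because the caller only
    // invokes it for tiles known to lie fully inside the valid region, so the
    // gathers can run unmasked.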
    void sampleTile3_avx2(float * data,
                          float& minVal,
                          float& maxVal,
                          const float *v,
                          const glm::vec3 /*tileIndex*/,
                          const glm::uvec3 /*tilePos*/,
                          const glm::uvec3 dataSize,
                          const glm::uvec3 maxIndices,
                          const glm::vec3 tp,
                          const glm::vec3 scale,
                          const glm::vec3 arrayPositionGlobal,
                          const glm::vec4* frustum,
                          const float /*minDistanceSquared*/,
                          const float /*maxDistanceSquared*/,
                          const glm::quat inverseOrientation,
                          const uint32_t coordSys,
                          const uint32_t minorCount,
                          const uint32_t sampleCount,
                          const glm::vec3 polarScale,
                          const glm::vec3 polarShift)
    {
        assert((dataSize.x & 7) == 0);
        assert(coordSys == 1);

        static const __m256 c01234567 = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
        glm::vec3 ban = arrayPositionGlobal - tp; // array position relative to the tile origin

        glm::uvec3 maxIndicesLL = glm::max(maxIndices, glm::uvec3(1u)) - glm::uvec3(1);

        __m256 minValue = _mm256_set1_ps(std::numeric_limits<float>::max());
        __m256 maxValue = _mm256_set1_ps(-std::numeric_limits<float>::max());
        for (uint32_t z = 0; z < dataSize.z; z++) {
            __m128 pz = _mm_mul_ss(_mm_set1_ps(scale.z), _mm_set1_ps(static_cast<float>(z)));
            __m128 qz = _mm_sub_ss(pz, _mm_set1_ps(ban.z));
            for (uint32_t y = 0; y < dataSize.y; y++) {
                __m128 py = _mm_mul_ss(_mm_set1_ps(scale.y), _mm_set1_ps(static_cast<float>(y)));
                __m128 qy = _mm_sub_ss(py, _mm_set1_ps(ban.y));
                __m128 r2_yz_ = _mm_add_ss(_mm_mul_ss(qy, qy), _mm_mul_ss(qz, qz));
                __m128 in0_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[0].z), qz));
                __m128 in1_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[1].z), qz));
                __m128 in2_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[2].z), qz));
                __m128 in3_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[3].z), qz));

                __m256 in0_dot_yz = _mm256_broadcastss_ps(in0_dot_yz_);
                __m256 in1_dot_yz = _mm256_broadcastss_ps(in1_dot_yz_);
                __m256 in2_dot_yz = _mm256_broadcastss_ps(in2_dot_yz_);
                __m256 in3_dot_yz = _mm256_broadcastss_ps(in3_dot_yz_);
                __m256 r2_yz = _mm256_broadcastss_ps(r2_yz_);

                for (uint32_t x = 0; x < dataSize.x; x += 8) {
                    //IACA_VC64_START;

                    __m256 i = _mm256_add_ps(_mm256_set1_ps(static_cast<float>(x)), c01234567);
                    __m256 qx = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(scale.x), i), _mm256_set1_ps(ban.x));
                    __m256 r2 = _mm256_add_ps(_mm256_mul_ps(qx, qx), r2_yz);

                    // Rotate q into the array frame and convert to beam coordinates.
                    __m256 ax, ay, az;
                    quat_times_vec3_ps(ax, ay, az,
                                       inverseOrientation,
                                       qx, _mm256_broadcastss_ps(qy), _mm256_broadcastss_ps(qz));

                    __m256 r_inv = _mm256_rsqrt_ps(r2);
                    __m256 r = _mm256_rcp_ps(r_inv); // r = 1/rsqrt(r2) = sqrt(r2); only needed by the range-based xi_k variant below

                    __m256 dirx = asin_ps(_mm256_mul_ps(ax, r_inv));
                    __m256 diry = atan_00155_ps(_mm256_mul_ps(ay, _mm256_rcp_ps(az)));
                    __m256 xi_i = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.x), _mm256_sub_ps(diry, _mm256_set1_ps(polarShift.x))));
                    __m256 xi_j = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.y), _mm256_sub_ps(dirx, _mm256_set1_ps(polarShift.y))));
                    //__m256 xi_k = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.z), _mm256_sub_ps(r, _mm256_set1_ps(polarShift.z))));
                    __m256 xi_k = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.z), _mm256_sub_ps(az, _mm256_set1_ps(polarShift.z))));

#if 0 // Note: re-enabling this requires un-commenting the distance parameters in the signature above.
                    for (unsigned lane = 0; lane < 8; lane++) {
                        glm::vec3 ref = getSamplePosRef(dataSize, tp, scale, arrayPositionGlobal, frustum, minDistanceSquared, maxDistanceSquared,
                                                        inverseOrientation, coordSys, polarScale, polarShift,
                                                        x + lane, y, z);
                        auto ex = std::abs(ref.x - xi_i.m256_f32[lane]);
                        auto ey = std::abs(ref.y - xi_j.m256_f32[lane]);
                        auto ez = std::abs(ref.z - xi_k.m256_f32[lane]);
                        assert(ex < 1e-2f);
                        assert(ey < 1e-2f);
                        assert(ez < 1e-0f);
                    }
#endif
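                    // Same split/clamp/bilinear blend as in
                    // sampleTile3_border_avx2, with unmasked gathers since
                    // every lane is assumed valid here.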
                    __m256 tau_i = _mm256_floor_ps(xi_i);
                    __m256 tau_j = _mm256_floor_ps(xi_j);
                    __m256 tau_k = _mm256_floor_ps(xi_k);
                    __m256 t_i = _mm256_sub_ps(xi_i, tau_i);
                    __m256 t_j = _mm256_sub_ps(xi_j, tau_j);
                    __m256 t_k = _mm256_sub_ps(xi_k, tau_k);

                    __m256 i_i = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.x)), tau_i);
                    __m256 i_j = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.y)), tau_j);
                    __m256 i_k = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.z)), tau_k);

                    __m256 ix00_ = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(static_cast<float>(sampleCount)),
                                                               _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(static_cast<float>(minorCount)), i_j), i_i)), i_k);
                    __m256i ix00 = _mm256_cvtps_epi32(ix00_);

                    __m256i ix01 = _mm256_add_epi32(ix00, _mm256_set1_epi32(minorCount));
                    __m256i ix10 = _mm256_add_epi32(ix00, _mm256_set1_epi32(sampleCount));
                    __m256i ix11 = _mm256_add_epi32(ix00, _mm256_set1_epi32(minorCount + sampleCount));

                    __m256 val00 = _mm256_i32gather_ps(v, ix00, 4);
                    __m256 val01 = _mm256_i32gather_ps(v, ix01, 4);
                    __m256 val10 = _mm256_i32gather_ps(v, ix10, 4);
                    __m256 val11 = _mm256_i32gather_ps(v, ix11, 4);

                    __m256 dif0 = _mm256_sub_ps(val01, val00);
                    __m256 val0 = _mm256_fmadd_ps(t_j, dif0, val00);

                    __m256 dif1 = _mm256_sub_ps(val11, val10);
                    __m256 val1 = _mm256_fmadd_ps(t_j, dif1, val10);

                    __m256 dif = _mm256_sub_ps(val1, val0);
                    __m256 val_ = _mm256_fmadd_ps(t_i, dif, val0);

                    // 50-50 mix of old and new
                    __m256 old = _mm256_load_ps(data);
                    val_ = _mm256_fmadd_ps(_mm256_set1_ps(0.5f), old, _mm256_mul_ps(_mm256_set1_ps(0.5f), val_));

                    // Track the stored values, as in the border variant.
                    minValue = _mm256_min_ps(minValue, val_);
                    maxValue = _mm256_max_ps(maxValue, val_);

                    _mm256_store_ps(data, val_);
                    data += 8;

                    //IACA_VC64_END;
                }
            }
        }

        minVal = glm::min(glm::min(glm::min(minValue.m256_f32[0], minValue.m256_f32[1]),
                                   glm::min(minValue.m256_f32[2], minValue.m256_f32[3])),
                          glm::min(glm::min(minValue.m256_f32[4], minValue.m256_f32[5]),
                                   glm::min(minValue.m256_f32[6], minValue.m256_f32[7])));

        maxVal = glm::max(glm::max(glm::max(maxValue.m256_f32[0], maxValue.m256_f32[1]),
                                   glm::max(maxValue.m256_f32[2], maxValue.m256_f32[3])),
                          glm::max(glm::max(maxValue.m256_f32[4], maxValue.m256_f32[5]),
                                   glm::max(maxValue.m256_f32[6], maxValue.m256_f32[7])));
        _mm256_zeroupper();
    }
    //#pragma optimize( "", on )

}