2#include <glm/gtc/quaternion.hpp>
// Replicates the float held in compile-time lane `lane` of x across all
// four lanes of the result.
template<int lane>
inline __m128 broadcast_ps(__m128 x)
{
    return _mm_shuffle_ps(x, x, _MM_SHUFFLE(lane, lane, lane, lane));
}
#if !defined(_MSC_VER) && !defined(__forceinline)
#define __forceinline inline
#endif
// Rotates four 3D vectors at once (SoA layout: v_x/v_y/v_z carry one vector
// component per lane) by the quaternion replicated in q's lanes (x, y, z, w),
// writing the rotated vectors to out_x/out_y/out_z.
//
// Uses the standard expansion of glm's quat * vec3:
//   uv  = q.xyz x v
//   uuv = q.xyz x uv
//   out = v + 2 * (q.w * uv + uuv)
//
// NOTE(review): reconstructed from a garbled extract -- the original also
// carried a scalar glm reference path under a preprocessor branch; the `q`
// parameter position was inferred from the call sites (rot_, qx, qy, qz).
__forceinline
void quat_times_vec3_ps(__m128& out_x, __m128& out_y, __m128& out_z,
                        const __m128& q,
                        const __m128& v_x,
                        const __m128& v_y,
                        const __m128& v_z)
{
    // Broadcast each quaternion component across all lanes.
    __m128 q_x = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0, 0, 0, 0));
    __m128 q_y = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1, 1, 1, 1));
    __m128 q_z = _mm_shuffle_ps(q, q, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 q_w = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
    // uv = cross(q.xyz, v)
    __m128 uv_x = _mm_sub_ps(_mm_mul_ps(q_y, v_z), _mm_mul_ps(v_y, q_z));
    __m128 uv_y = _mm_sub_ps(_mm_mul_ps(q_z, v_x), _mm_mul_ps(v_z, q_x));
    __m128 uv_z = _mm_sub_ps(_mm_mul_ps(q_x, v_y), _mm_mul_ps(v_x, q_y));
    // uuv = cross(q.xyz, uv)
    __m128 uuv_x = _mm_sub_ps(_mm_mul_ps(q_y, uv_z), _mm_mul_ps(uv_y, q_z));
    __m128 uuv_y = _mm_sub_ps(_mm_mul_ps(q_z, uv_x), _mm_mul_ps(uv_z, q_x));
    __m128 uuv_z = _mm_sub_ps(_mm_mul_ps(q_x, uv_y), _mm_mul_ps(uv_x, q_y));
    // t = q.w * uv + uuv;  out = v + 2 * t  (2*t computed as t + t).
    __m128 t_x = _mm_add_ps(_mm_mul_ps(q_w, uv_x), uuv_x);
    __m128 t_y = _mm_add_ps(_mm_mul_ps(q_w, uv_y), uuv_y);
    __m128 t_z = _mm_add_ps(_mm_mul_ps(q_w, uv_z), uuv_z);
    out_x = _mm_add_ps(v_x, _mm_add_ps(t_x, t_x));
    out_y = _mm_add_ps(v_y, _mm_add_ps(t_y, t_y));
    out_z = _mm_add_ps(v_z, _mm_add_ps(t_z, t_z));
}
#if !defined(_MSC_VER) && !defined(__forceinline)
#define __forceinline inline
#endif
// Fast polynomial approximation of atan(x) for x in [-1, 1]:
//   atan(x) ~= (pi/4)*x - x*(|x| - 1)*(0.2447 + 0.0663*|x|)
// Maximum absolute error < 0.00155 rad (hence the name); verified by the
// UnitTests block below against std::atan.
// NOTE(review): constants C[1..3] reconstructed from the published
// approximation (Rajan et al.) -- the original lines were lost in extraction.
__forceinline __m128 atan_00155_ps(__m128 x)
{
    static const float signBit = -0.f;
    // C lanes: c0 = pi/4, c1 = 0.2447, c2 = 0.0663, c3 = 1.
    static const __m128 C = _mm_setr_ps(
        float(3.14159265358979323846264338327950288 / 4.0),
        0.2447f,
        0.0663f,
        1.f);
    const auto c0 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(0, 0, 0, 0));
    const auto c1 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(1, 1, 1, 1));
    const auto c2 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(2, 2, 2, 2));
    const auto c3 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(3, 3, 3, 3));
    // |x| via clearing the sign bit.
    __m128 sign = _mm_load1_ps(&signBit);
    __m128 abs_x = _mm_andnot_ps(sign, x);
    __m128 t0 = _mm_mul_ps(c0, x);                      // (pi/4)*x
    __m128 t1 = _mm_sub_ps(abs_x, c3);                  // |x| - 1
    __m128 t2 = _mm_mul_ps(abs_x, c2);                  // 0.0663*|x|
    __m128 t3 = _mm_add_ps(c1, t2);                     // 0.2447 + 0.0663*|x|
    __m128 t4 = _mm_mul_ps(x, _mm_mul_ps(t1, t3));      // x*(|x|-1)*(...)
    __m128 t5 = _mm_sub_ps(t0, t4);
    return t5;
}
#if !defined(_MSC_VER) && !defined(__forceinline)
#define __forceinline inline
#endif
// Approximate asin(x) for x in [-1, 1] using Abramowitz & Stegun 4.4.45:
//   asin(|x|) = pi/2 - sqrt(1 - |x|) * (C0 + C1|x| + C2|x|^2 + C3|x|^3)
// with the sign of x restored at the end. sqrt is approximated by
// rcp(rsqrt(.)), so total absolute error is ~2e-3.
// NOTE(review): garbled extract consolidated -- the visible degree-3 branch
// accumulated C3|x| + C2|x| + C1|x| + C0 (broken Horner); rewritten as a
// proper Horner evaluation, matching asin_v2_ps below. A degree-7 branch
// referenced constants not present in this extract and was dropped.
__forceinline __m128 asin_ps(__m128 x)
{
    static const float signBit = -0.f;
    static const float asin_deg3_C3 = -0.0187293f;
    static const float asin_deg3_C2 = 0.0742610f;
    static const float asin_deg3_C1 = -0.2121144f;
    static const float asin_deg3_C0 = 1.5707288f;
    // _mm_set_ps stores C0 in lane 0 ... C3 in lane 3.
    static const __m128 asin_deg3_C = _mm_set_ps(asin_deg3_C3, asin_deg3_C2, asin_deg3_C1, asin_deg3_C0);
    static const float one = 1.f;
    static const float piTwo = 1.5707963267948966f;

    // |x| via clearing the sign bit.
    __m128 sign = _mm_load1_ps(&signBit);
    __m128 abs_x = _mm_andnot_ps(sign, x);

    // Horner: r = ((C3*|x| + C2)*|x| + C1)*|x| + C0
    __m128 C = _mm_load_ps((const float*)(&asin_deg3_C));
    __m128 r = _mm_shuffle_ps(C, C, _MM_SHUFFLE(3, 3, 3, 3));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_shuffle_ps(C, C, _MM_SHUFFLE(2, 2, 2, 2)));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_shuffle_ps(C, C, _MM_SHUFFLE(1, 1, 1, 1)));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_shuffle_ps(C, C, _MM_SHUFFLE(0, 0, 0, 0)));

    // q ~= sqrt(1 - |x|); rcp(rsqrt(0)) yields 0, so |x| == 1 is exact.
    __m128 q = _mm_sub_ps(_mm_load1_ps(&one), abs_x);
    q = _mm_rcp_ps(_mm_rsqrt_ps(q));

    r = _mm_sub_ps(_mm_load1_ps(&piTwo), _mm_mul_ps(q, r));
    // Reapply the sign of x.
    return _mm_or_ps(r, _mm_and_ps(x, sign));
}
#if !defined(_MSC_VER) && !defined(__forceinline)
#define __forceinline inline
#endif
// Register-constant variant of asin_ps: same Abramowitz & Stegun 4.4.45
// approximation, but with all coefficients materialized via _mm_set1_ps
// instead of loads from static floats.
//   asin(|x|) = pi/2 - sqrt(1 - |x|) * (((C3|x| + C2)|x| + C1)|x| + C0)
// NOTE(review): the sqrt(1-|x|) step and the Horner seed were lost in
// extraction; reconstructed as rcp(rsqrt(.)) to match asin_ps -- confirm
// against the original.
__forceinline __m128 asin_v2_ps(__m128 x)
{
    const __m128 sign_ps = _mm_set1_ps(-0.f);
    const __m128 one_ps = _mm_set1_ps(1.f);
    const __m128 halfpi_ps = _mm_set1_ps(1.5707963267948966f);
    const __m128 C3_ps = _mm_set1_ps(-0.0187293f);
    const __m128 C2_ps = _mm_set1_ps(0.0742610f);
    const __m128 C1_ps = _mm_set1_ps(-0.2121144f);
    const __m128 C0_ps = _mm_set1_ps(1.5707288f);

    // |x| via clearing the sign bit.
    __m128 abs_x = _mm_andnot_ps(sign_ps, x);

    // a ~= sqrt(1 - |x|)
    __m128 a = _mm_sub_ps(one_ps, abs_x);
    a = _mm_rcp_ps(_mm_rsqrt_ps(a));

    // Horner evaluation of the cubic in |x|.
    __m128 b = C3_ps;
    b = _mm_add_ps(_mm_mul_ps(b, abs_x), C2_ps);
    b = _mm_add_ps(_mm_mul_ps(b, abs_x), C1_ps);
    b = _mm_add_ps(_mm_mul_ps(b, abs_x), C0_ps);

    __m128 rv = _mm_sub_ps(halfpi_ps, _mm_mul_ps(a, b));
    // Reapply the sign of x.
    rv = _mm_or_ps(rv, _mm_and_ps(x, sign_ps));
    return rv;
}
172#pragma optimize( "", off )
173 static struct UnitTests
177 const unsigned N = 1000;
178 for (
unsigned i = 0; i < N; i++) {
180 x.m128_f32[0] = (2.f / (N - 1))*i - 1.f;
181 y = atan_00155_ps(x);
182 auto e = std::abs(atan(x.m128_f32[0]) - y.m128_f32[0]);
183 assert(e < 0.00155f);
186 for (
unsigned i = 0; i < N; i++) {
188 x.m128_f32[0] = (2.f / (N - 1))*i - 1.f;
190 auto e = std::abs(std::asin(x.m128_f32[0]) - y.m128_f32[0]);
195#pragma optimize( "", on )
200namespace Cogs::Core::EchoSounder {
// Resamples one tile of a Cartesian output volume from polar-coordinate echo
// sounder sample data. SSE4.1 "border" variant: each 4-wide lane group is
// tested against the four frustum planes and the min/max range, and only
// lanes whose mask bit is set are sampled and written.
// NOTE(review): this extract is garbled. Lines missing from view include a
// tile-position parameter ("tp" used below), the sample pointer "v", the
// ax/ay/az declarations, the preprocessor scaffolding that selects between
// the scalar reference paths and the SIMD-assisted path, and closing braces.
// Comments are partially inferred -- confirm against the original file.
202    void sampleTile_border_sse41(
float * data,
206 const glm::uvec3 dataSize,
207 const glm::uvec3 maxIndices,
209 const glm::vec3 scale,
210 const glm::vec3 arrayPositionGlobal,
211 const glm::vec4* frustum,
212 const float minDistanceSquared,
213 const float maxDistanceSquared,
214 const glm::quat inverseOrientation,
215 const uint32_t coordSys,
216 const uint32_t minorCount,
217 const uint32_t sampleCount,
218 const glm::vec3 polarScale,
219 const glm::vec3 polarShift)
// Inverse orientation quaternion packed into SIMD lanes (x, y, z, w).
221 const __m128 rot_ = _mm_set_ps(inverseOrientation.w, inverseOrientation.z, inverseOrientation.y, inverseOrientation.x);
// The inner loop handles 4 voxels at a time; only coordSys 1 is supported.
222 assert((dataSize.x & 3) == 0);
223 assert(coordSys == 1);
// Array origin relative to the tile position ("tp" declared off-view).
225 glm::vec3 ban = arrayPositionGlobal - tp;
227 for (uint32_t z = 0; z < dataSize.z; z++) {
228 __m128 pz = _mm_mul_ss(_mm_set1_ps(scale.z), _mm_set1_ps((
float)z));
229 __m128 qz = _mm_sub_ss(pz, _mm_set1_ps(ban.z));
230 for (uint32_t y = 0; y < dataSize.y; y++) {
231 __m128 py = _mm_mul_ss(_mm_set1_ps(scale.y), _mm_set1_ps((
float)y));
232 __m128 qy = _mm_sub_ss(py, _mm_set1_ps(ban.y));
// Hoist the y/z-dependent parts of the squared radius and the four
// frustum-plane dot products out of the x loop, then broadcast lane 0.
233 __m128 r2_yz = _mm_add_ss(_mm_mul_ss(qy, qy), _mm_mul_ss(qz, qz));
234 __m128 in0_dot_yz = _mm_add_ss(_mm_mul_ss(_mm_set1_ps(frustum[0].y), qy), _mm_mul_ss(_mm_set1_ps(frustum[0].z), qz));
235 __m128 in1_dot_yz = _mm_add_ss(_mm_mul_ss(_mm_set1_ps(frustum[1].y), qy), _mm_mul_ss(_mm_set1_ps(frustum[1].z), qz));
236 __m128 in2_dot_yz = _mm_add_ss(_mm_mul_ss(_mm_set1_ps(frustum[2].y), qy), _mm_mul_ss(_mm_set1_ps(frustum[2].z), qz));
237 __m128 in3_dot_yz = _mm_add_ss(_mm_mul_ss(_mm_set1_ps(frustum[3].y), qy), _mm_mul_ss(_mm_set1_ps(frustum[3].z), qz));
239 in0_dot_yz = _mm_shuffle_ps(in0_dot_yz, in0_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
240 in1_dot_yz = _mm_shuffle_ps(in1_dot_yz, in1_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
241 in2_dot_yz = _mm_shuffle_ps(in2_dot_yz, in2_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
242 in3_dot_yz = _mm_shuffle_ps(in3_dot_yz, in3_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
243 r2_yz = _mm_shuffle_ps(r2_yz, r2_yz, _MM_SHUFFLE(0, 0, 0, 0));
245 for (uint32_t x = 0; x < dataSize.x; x += 4) {
// Positions of four consecutive voxels along x, relative to the array.
248 __m128 i = _mm_setr_ps((
float)(x + 0), (
float)(x + 1), (
float)(x + 2), (
float)(x + 3));
250 __m128 qx = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(scale.x), i), _mm_set1_ps(ban.x));
253 __m128 r2 = _mm_add_ps(_mm_mul_ps(qx, qx), r2_yz);
// Inside-ness tests: four frustum plane half-spaces plus min/max range;
// movemask packs one accept bit per lane.
256 __m128 mask0 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].x), qx), in0_dot_yz));
257 __m128 mask1 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].x), qx), in1_dot_yz));
258 __m128 mask2 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].x), qx), in2_dot_yz));
259 __m128 mask3 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].x), qx), in3_dot_yz));
260 __m128 mask4 = _mm_cmple_ps(_mm_set1_ps(minDistanceSquared), r2);
261 __m128 mask5 = _mm_cmple_ps(r2, _mm_set1_ps(maxDistanceSquared));
262 __m128 mask = _mm_and_ps(_mm_and_ps(_mm_and_ps(mask0, mask1),
263 _mm_and_ps(mask2, mask3)),
264 _mm_and_ps(mask4, mask5));
265 int movemask = _mm_movemask_ps(mask);
// Rotate the four position vectors into the array's local frame
// (ax/ay/az declarations and the rot_ argument line are off-view).
271 quat_times_vec3_ps(ax, ay, az,
273 qx, broadcast_ps<0>(qy), broadcast_ps<0>(qz));
// r = sqrt(r2) approximated as rcp(rsqrt(r2)).
275 __m128 r_inv = _mm_rsqrt_ps(r2);
276 __m128 r = _mm_rcp_ps(r_inv);
// Polar direction angles from the rotated position.
280 __m128 dirx = asin_ps(_mm_mul_ps(ax, r_inv));
281 __m128 diry = atan_00155_ps(_mm_mul_ps(ay, _mm_rcp_ps(az)));
// Map (diry, dirx, r) into fractional polar texture coordinates,
// split into integer cell (tau) and interpolation weight (t).
284 __m128 xi_i = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.x), _mm_sub_ps(diry, _mm_set1_ps(polarShift.x))));
285 __m128 xi_j = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.y), _mm_sub_ps(dirx, _mm_set1_ps(polarShift.y))));
286 __m128 xi_k = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.z), _mm_sub_ps(r, _mm_set1_ps(polarShift.z))));
287 __m128 tau_i = _mm_floor_ps(xi_i);
288 __m128 tau_j = _mm_floor_ps(xi_j);
289 __m128 tau_k = _mm_floor_ps(xi_k);
290 __m128 t_i = _mm_sub_ps(xi_i, tau_i);
291 __m128 t_j = _mm_sub_ps(xi_j, tau_j);
292 __m128 t_k = _mm_sub_ps(xi_k, tau_k);
// Clamp cell indices (and their +1 neighbours) to the valid range.
294 __m128 i_i = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.x), tau_i));
295 __m128 i_j = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.y), tau_j));
296 __m128 i_k = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.z), tau_k));
297 __m128 j_i = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.x), _mm_add_ps(tau_i, _mm_set1_ps(1.f))));
298 __m128 j_j = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.y), _mm_add_ps(tau_j, _mm_set1_ps(1.f))));
// Per-lane scalar tail over accepted lanes. The jumps in the original
// line numbering suggest the blocks below are alternative variants under
// preprocessor branches (fully scalar, scalar reusing SIMD angles,
// scalar reusing SIMD r/xi, and SIMD-index), not sequential code.
300 for (
unsigned lane = 0; lane < 4; lane++) {
301 if (((movemask >> lane) & 1) == 0)
continue;
// Variant 1: fully scalar reference path.
304 glm::vec3 a(ax.m128_f32[lane],
308 float r = std::sqrt(r2.m128_f32[lane]);
310 float dirX = std::asin(a.x / r);
311 float dirY = std::atan(a.y / a.z);
312 auto polarPos = glm::vec3(dirY, dirX, r);
315 glm::vec3 xi = glm::max(glm::vec3(0.f), polarScale*(polarPos - polarShift));
318 glm::vec3 tau = glm::floor(xi);
320 glm::vec3 t = xi - tau;
321 glm::uvec3 i = glm::min(maxIndices, glm::uvec3(tau));
322 glm::uvec2 j = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
// Bilinear blend in the (x-minor, y-major) plane at slice i.z.
324 float val00 = v[(i.y*minorCount + i.x)*sampleCount + i.z];
325 float val01 = v[(j.y*minorCount + i.x)*sampleCount + i.z];
326 float val0 = (1.f - t.y)*val00 + t.y*val01;
328 float val10 = v[(i.y*minorCount + j.x)*sampleCount + i.z];
329 float val11 = v[(j.y*minorCount + j.x)*sampleCount + i.z];
330 float val1 = (1.f - t.y)*val10 + t.y*val11;
332 float val = (1.f - t.x)*val0 + t.x*val1;
333 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
// Variant 2: reuses the SIMD dirx/diry angles.
337 glm::vec3 a(ax.m128_f32[lane],
341 float r = std::sqrt(r2.m128_f32[lane]);
343 float dirX = dirx.m128_f32[lane];
344 float dirY = diry.m128_f32[lane];
345 auto polarPos = glm::vec3(dirY, dirX, r);
348 glm::vec3 xi = glm::max(glm::vec3(0.f), polarScale*(polarPos - polarShift));
351 glm::vec3 tau = glm::floor(xi);
353 glm::vec3 t = xi - tau;
354 glm::uvec3 i = glm::min(maxIndices, glm::uvec3(tau));
355 glm::uvec2 j = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
357 float val00 = v[(i.y*minorCount + i.x)*sampleCount + i.z];
358 float val01 = v[(j.y*minorCount + i.x)*sampleCount + i.z];
359 float val0 = (1.f - t.y)*val00 + t.y*val01;
361 float val10 = v[(i.y*minorCount + j.x)*sampleCount + i.z];
362 float val11 = v[(j.y*minorCount + j.x)*sampleCount + i.z];
363 float val1 = (1.f - t.y)*val10 + t.y*val11;
365 float val = (1.f - t.x)*val0 + t.x*val1;
366 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
// Variant 3: additionally reuses the SIMD r and xi values.
369 glm::vec3 a(ax.m128_f32[lane],
373 float r_ = r.m128_f32[lane];
375 float dirX = dirx.m128_f32[lane];
376 float dirY = diry.m128_f32[lane];
377 auto polarPos = glm::vec3(dirY, dirX, r_);
380 glm::vec3 xi = glm::max(glm::vec3(0.f), polarScale*(polarPos - polarShift));
381 xi.x = xi_i.m128_f32[lane];
382 xi.y = xi_j.m128_f32[lane];
383 xi.z = xi_k.m128_f32[lane];
386 glm::vec3 tau = glm::floor(xi);
391 glm::vec3 t = xi - tau;
392 glm::uvec3 i = glm::min(maxIndices, glm::uvec3(tau));
393 glm::uvec2 j = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
395 float val00 = v[(i.y*minorCount + i.x)*sampleCount + i.z];
396 float val01 = v[(j.y*minorCount + i.x)*sampleCount + i.z];
397 float val0 = (1.f - t.y)*val00 + t.y*val01;
399 float val10 = v[(i.y*minorCount + j.x)*sampleCount + i.z];
400 float val11 = v[(j.y*minorCount + j.x)*sampleCount + i.z];
401 float val1 = (1.f - t.y)*val10 + t.y*val11;
403 float val = (1.f - t.x)*val0 + t.x*val1;
404 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
// Variant 4: takes tau/t and the clamped indices straight from the
// SIMD registers computed above.
411 tau.x = tau_i.m128_f32[lane];
412 tau.y = tau_j.m128_f32[lane];
413 tau.z = tau_k.m128_f32[lane];
// NOTE(review): BUG -- t.x is assigned three times below; the last two
// assignments were presumably meant for t.y and t.z.
416 t.x = t_i.m128_f32[lane];
417 t.x = t_j.m128_f32[lane];
418 t.x = t_k.m128_f32[lane];
421 ii.x =
unsigned(i_i.m128_f32[lane]);
422 ii.y = unsigned(i_j.m128_f32[lane]);
423 ii.z = unsigned(i_k.m128_f32[lane]);
426 j.x = unsigned(j_i.m128_f32[lane]);
427 j.y = unsigned(j_j.m128_f32[lane]);
431 float val00 = v[(ii.y*minorCount + ii.x)*sampleCount + ii.z];
432 float val01 = v[(j.y*minorCount + ii.x)*sampleCount + ii.z];
433 float val0 = (1.f - t.y)*val00 + t.y*val01;
435 float val10 = v[(ii.y*minorCount + j.x)*sampleCount + ii.z];
436 float val11 = v[(j.y*minorCount + j.x)*sampleCount + ii.z];
437 float val1 = (1.f - t.y)*val10 + t.y*val11;
439 float val = (1.f - t.x)*val0 + t.x*val1;
441 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
// SSE4.1 "inner" variant of the tile sampler: same coordinate math as the
// border version, but intended for groups fully inside the frustum/range so
// the bilinear gather and blend run fully vectorized and the result is
// written with one _mm_store_ps per 4-voxel group.
// NOTE(review): garbled extract -- the "tp" parameter, the sample pointer
// "v", the ax/ay/az declarations, the mask-consuming control flow, several
// preprocessor branches and closing braces are missing from view; comments
// are partially inferred -- confirm against the original file.
454    void sampleTile_inner_sse41(
float * data,
458 const glm::uvec3 dataSize,
459 const glm::uvec3 maxIndices,
461 const glm::vec3 scale,
462 const glm::vec3 arrayPositionGlobal,
463 const glm::vec4* frustum,
464 const float minDistanceSquared,
465 const float maxDistanceSquared,
466 const glm::quat inverseOrientation,
467 const uint32_t coordSys,
468 const uint32_t minorCount,
469 const uint32_t sampleCount,
470 const glm::vec3 polarScale,
471 const glm::vec3 polarShift)
// Inverse orientation quaternion packed into SIMD lanes (x, y, z, w).
473 const __m128 rot_ = _mm_set_ps(inverseOrientation.w, inverseOrientation.z, inverseOrientation.y, inverseOrientation.x);
474 assert((dataSize.x & 3) == 0);
475 assert(coordSys == 1);
477 static const __m128 c0123 = _mm_setr_ps(0, 1, 2, 3);
478 static const __m128 one_ps = _mm_setr_ps(1.f, 1.f, 1.f, 1.f);
// Array origin relative to the tile position ("tp" declared off-view).
479 glm::vec3 ban = arrayPositionGlobal - tp;
481 for (uint32_t z = 0; z < dataSize.z; z++) {
482 __m128 pz = _mm_mul_ss(_mm_set1_ps(scale.z), _mm_set1_ps((
float)z));
483 __m128 qz = _mm_sub_ss(pz, _mm_set1_ps(ban.z));
484 for (uint32_t y = 0; y < dataSize.y; y++) {
485 __m128 py = _mm_mul_ss(_mm_set1_ps(scale.y), _mm_set1_ps((
float)y));
486 __m128 qy = _mm_sub_ss(py, _mm_set1_ps(ban.y));
// y/z-invariant pieces of the radius and frustum dot products hoisted
// out of the x loop and broadcast from lane 0.
487 __m128 r2_yz = _mm_add_ss(_mm_mul_ss(qy, qy), _mm_mul_ss(qz, qz));
488 __m128 in0_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[0].z), qz));
489 __m128 in1_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[1].z), qz));
490 __m128 in2_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[2].z), qz));
491 __m128 in3_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[3].z), qz));
493 in0_dot_yz = _mm_shuffle_ps(in0_dot_yz, in0_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
494 in1_dot_yz = _mm_shuffle_ps(in1_dot_yz, in1_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
495 in2_dot_yz = _mm_shuffle_ps(in2_dot_yz, in2_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
496 in3_dot_yz = _mm_shuffle_ps(in3_dot_yz, in3_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
497 r2_yz = _mm_shuffle_ps(r2_yz, r2_yz, _MM_SHUFFLE(0, 0, 0, 0));
499 for (uint32_t x = 0; x < dataSize.x; x += 4) {
// Positions of four consecutive voxels along x, relative to the array.
502 __m128 i = _mm_add_ps(_mm_set1_ps((
float)x), c0123);
503 __m128 qx = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(scale.x), i), _mm_set1_ps(ban.x));
506 __m128 r2 = _mm_add_ps(_mm_mul_ps(qx, qx), r2_yz);
// Raw frustum-plane distances. gmask0..3 are not consumed in the
// visible lines -- presumably used by early-out code lost from this
// extract.
509 __m128 gmask0 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].x), qx), in0_dot_yz);
510 __m128 gmask1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].x), qx), in1_dot_yz);
511 __m128 gmask2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].x), qx), in2_dot_yz);
512 __m128 gmask3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].x), qx), in3_dot_yz);
// Per-lane inside-ness: frustum half-spaces plus min/max range.
515 __m128 mask0 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].x), qx), in0_dot_yz));
516 __m128 mask1 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].x), qx), in1_dot_yz));
517 __m128 mask2 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].x), qx), in2_dot_yz));
518 __m128 mask3 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].x), qx), in3_dot_yz));
519 __m128 mask4 = _mm_cmple_ps(_mm_set1_ps(minDistanceSquared), r2);
520 __m128 mask5 = _mm_cmple_ps(r2, _mm_set1_ps(maxDistanceSquared));
521 __m128 mask = _mm_and_ps(_mm_and_ps(_mm_and_ps(mask0, mask1),
522 _mm_and_ps(mask2, mask3)),
523 _mm_and_ps(mask4, mask5));
// Rotate the four position vectors into the array's local frame
// (ax/ay/az declarations and the rot_ argument line are off-view).
531 quat_times_vec3_ps(ax, ay, az,
533 qx, broadcast_ps<0>(qy), broadcast_ps<0>(qz));
// r = sqrt(r2) approximated as rcp(rsqrt(r2)).
535 __m128 r_inv = _mm_rsqrt_ps(r2);
536 __m128 r = _mm_rcp_ps(r_inv);
// Polar direction angles from the rotated position.
540 __m128 dirx = asin_ps(_mm_mul_ps(ax, r_inv));
541 __m128 diry = atan_00155_ps(_mm_mul_ps(ay, _mm_rcp_ps(az)));
// Fractional polar texture coordinates -> integer cell (tau) and
// interpolation weight (t).
544 __m128 xi_i = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.x), _mm_sub_ps(diry, _mm_set1_ps(polarShift.x))));
545 __m128 xi_j = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.y), _mm_sub_ps(dirx, _mm_set1_ps(polarShift.y))));
546 __m128 xi_k = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.z), _mm_sub_ps(r, _mm_set1_ps(polarShift.z))));
547 __m128 tau_i = _mm_floor_ps(xi_i);
548 __m128 tau_j = _mm_floor_ps(xi_j);
549 __m128 tau_k = _mm_floor_ps(xi_k);
550 __m128 t_i = _mm_sub_ps(xi_i, tau_i);
551 __m128 t_j = _mm_sub_ps(xi_j, tau_j);
552 __m128 t_k = _mm_sub_ps(xi_k, tau_k);
// Clamp cell indices (and their +1 neighbours) to the valid range.
554 __m128 i_i = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.x), tau_i));
555 __m128 i_j = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.y), tau_j));
556 __m128 i_k = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.z), tau_k));
557 __m128 j_i = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.x), _mm_add_ps(tau_i, _mm_set1_ps(1.f))));
558 __m128 j_j = (_mm_min_ps(_mm_set1_ps((
float)maxIndices.y), _mm_add_ps(tau_j, _mm_set1_ps(1.f))));
// Convert the clamped indices to integers for address computation.
561 __m128i i_i_ = _mm_cvtps_epi32(i_i);
562 __m128i i_j_ = _mm_cvtps_epi32(i_j);
563 __m128i i_k_ = _mm_cvtps_epi32(i_k);
565 __m128i j_i_ = _mm_cvtps_epi32(j_i);
566 __m128i j_j_ = _mm_cvtps_epi32(j_j);
// Flat indices ((y*minorCount + x)*sampleCount + z) for the four
// bilinear corners; gathers done with scalar loads per lane.
568 __m128i ix00 = _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(sampleCount), _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(minorCount), i_j_), i_i_)), i_k_);
569 __m128 val00 = _mm_setr_ps(v[ix00.m128i_u32[0]],
570 v[ix00.m128i_u32[1]],
571 v[ix00.m128i_u32[2]],
572 v[ix00.m128i_u32[3]]);
574 __m128i ix01 = _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(sampleCount), _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(minorCount), j_j_), i_i_)), i_k_);
575 __m128 val01 = _mm_setr_ps(v[ix01.m128i_u32[0]],
576 v[ix01.m128i_u32[1]],
577 v[ix01.m128i_u32[2]],
578 v[ix01.m128i_u32[3]]);
579 __m128 val0 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one_ps, t_j), val00), _mm_mul_ps(t_j, val01));
582 __m128i ix10 = _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(sampleCount), _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(minorCount), i_j_), j_i_)), i_k_);
583 __m128 val10 = _mm_setr_ps(v[ix10.m128i_u32[0]],
584 v[ix10.m128i_u32[1]],
585 v[ix10.m128i_u32[2]],
586 v[ix10.m128i_u32[3]]);
588 __m128i ix11 = _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(sampleCount), _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(minorCount), j_j_), j_i_)), i_k_);
// NOTE(review): BUG -- val11 gathers with ix01 (the indices computed for
// val01) instead of ix11 computed on the previous line, so the bilinear
// blend reuses the wrong corner samples.
589 __m128 val11 = _mm_setr_ps(v[ix01.m128i_u32[0]],
590 v[ix01.m128i_u32[1]],
591 v[ix01.m128i_u32[2]],
592 v[ix01.m128i_u32[3]]);
// Bilinear blend and store of the whole 4-voxel group.
594 __m128 val1 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one_ps, t_j), val10), _mm_mul_ps(t_j, val11));
596 __m128 val_ = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one_ps, t_i), val0), _mm_mul_ps(t_i, val1));
598 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x;
599 _mm_store_ps(data + index, val_);
// Per-lane scalar blocks below: the original-line-number jumps suggest
// these are verification/reference variants under preprocessor branches
// rather than sequential code.
603 for (
unsigned lane = 0; lane < 4; lane++) {
// Fully scalar reference path.
609 glm::vec3 a(ax.m128_f32[lane],
613 float r = std::sqrt(r2.m128_f32[lane]);
615 float dirX = std::asin(a.x / r);
616 float dirY = std::atan(a.y / a.z);
617 auto polarPos = glm::vec3(dirY, dirX, r);
620 glm::vec3 xi = glm::max(glm::vec3(0.f), polarScale*(polarPos - polarShift));
623 glm::vec3 tau = glm::floor(xi);
625 glm::vec3 t = xi - tau;
626 glm::uvec3 i = glm::min(maxIndices, glm::uvec3(tau));
627 glm::uvec2 j = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
629 float val00 = v[(i.y*minorCount + i.x)*sampleCount + i.z];
630 float val01 = v[(j.y*minorCount + i.x)*sampleCount + i.z];
631 float val0 = (1.f - t.y)*val00 + t.y*val01;
633 float val10 = v[(i.y*minorCount + j.x)*sampleCount + i.z];
634 float val11 = v[(j.y*minorCount + j.x)*sampleCount + i.z];
635 float val1 = (1.f - t.y)*val10 + t.y*val11;
637 float val = (1.f - t.x)*val0 + t.x*val1;
638 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
// Variant taking tau/t and the indices from the SIMD registers.
643 tau.x = tau_i.m128_f32[lane];
644 tau.y = tau_j.m128_f32[lane];
645 tau.z = tau_k.m128_f32[lane];
// NOTE(review): BUG -- t.x is assigned three times below; the last two
// assignments were presumably meant for t.y and t.z.
648 t.x = t_i.m128_f32[lane];
649 t.x = t_j.m128_f32[lane];
650 t.x = t_k.m128_f32[lane];
653 i.x =
unsigned(i_i.m128_f32[lane]);
654 i.y = unsigned(i_j.m128_f32[lane]);
655 i.z = unsigned(i_k.m128_f32[lane]);
658 j.x = unsigned(j_i.m128_f32[lane]);
659 j.y = unsigned(j_j.m128_f32[lane]);
// Re-derives the bilinear blend from the SIMD gather indices and
// compares/overwrites with the vectorized result (val_).
662 float val00 = v[ix00.m128i_u32[lane]];
663 float val01 = v[ix01.m128i_u32[lane]];
666 float val0 = (1.f - t.y)*val00 + t.y*val01;
668 float val10 = v[ix10.m128i_u32[lane]];
669 float val11 = v[ix11.m128i_u32[lane]];
672 float val1 = (1.f - t.y)*val10 + t.y*val11;
674 float val = (1.f - t.x)*val0 + t.x*val1;
676 val = val_.m128_f32[lane];
678 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
688#pragma optimize( "", on )