// Cogs.Core
// UniformGridSystem_sample_sse41.cpp
#include <cassert>
#include <cstdint>

#include <immintrin.h>

#include <glm/glm.hpp>
#include <glm/gtc/quaternion.hpp>
3
4//#include "C:\utils\iaca-win64\iacaMarks.h"
5
6namespace {
7
8 template<int lane> inline __m128 broadcast_ps(__m128 x) {
9 return _mm_shuffle_ps(x, x, _MM_SHUFFLE(lane, lane, lane, lane));
10 }
11
12 __forceinline void quat_times_vec3_ps(__m128& out_x, __m128& out_y, __m128& out_z,
13 const __m128& q,
14 const __m128& v_x, const __m128& v_y, const __m128& v_z)
15 {
16#if 0
17 // Reference
18 glm::quat rot;
19 rot.x = q.m128_f32[0];
20 rot.y = q.m128_f32[1];
21 rot.z = q.m128_f32[2];
22 rot.w = q.m128_f32[3];
23 for (int i = 0; i < 4; i++) {
24 glm::vec3 v;
25 v.x = v_x.m128_f32[i];
26 v.y = v_y.m128_f32[i];
27 v.z = v_z.m128_f32[i];
28
29 glm::vec3 w = rot * v;
30 out_x.m128_f32[i] = w.x;
31 out_y.m128_f32[i] = w.y;
32 out_z.m128_f32[i] = w.z;
33 }
34 return;
35#endif
36
37 __m128 q_x = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0, 0, 0, 0));
38 __m128 q_y = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1, 1, 1, 1));
39 __m128 q_z = _mm_shuffle_ps(q, q, _MM_SHUFFLE(2, 2, 2, 2));
40 __m128 q_w = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
41
42 __m128 uv_x = _mm_sub_ps(_mm_mul_ps(q_y, v_z), _mm_mul_ps(v_y, q_z));
43 __m128 uv_y = _mm_sub_ps(_mm_mul_ps(q_z, v_x), _mm_mul_ps(v_z, q_x));
44 __m128 uv_z = _mm_sub_ps(_mm_mul_ps(q_x, v_y), _mm_mul_ps(v_x, q_y));
45 __m128 uuv_x = _mm_sub_ps(_mm_mul_ps(q_y, uv_z), _mm_mul_ps(uv_y, q_z));
46 __m128 uuv_y = _mm_sub_ps(_mm_mul_ps(q_z, uv_x), _mm_mul_ps(uv_z, q_x));
47 __m128 uuv_z = _mm_sub_ps(_mm_mul_ps(q_x, uv_y), _mm_mul_ps(uv_x, q_y));
48 __m128 t_x = _mm_add_ps(_mm_mul_ps(q_w, uv_x), uuv_x);
49 __m128 t_y = _mm_add_ps(_mm_mul_ps(q_w, uv_y), uuv_y);
50 __m128 t_z = _mm_add_ps(_mm_mul_ps(q_w, uv_z), uuv_z);
51 out_x = _mm_add_ps(v_x, _mm_add_ps(t_x, t_x));
52 out_y = _mm_add_ps(v_y, _mm_add_ps(t_y, t_y));
53 out_z = _mm_add_ps(v_z, _mm_add_ps(t_z, t_z));
54 }
55
    /// Fast vectorized atan approximation, intended for inputs in [-1, 1].
    ///
    /// Implements atan(x) ~= (pi/4)*x - x*(|x| - 1)*(0.2447 + 0.0663*|x|);
    /// the unit test below checks max abs error < 0.00155 over [-1, 1]
    /// (hence the name). NOTE(review): no range reduction is performed, so
    /// accuracy outside [-1, 1] is not guaranteed.
    __forceinline __m128 atan_00155_ps(__m128 x)
    {

        static const float signBit = -0.f;
        //static const float C0 = float(3.14159265358979323846264338327950288 / 4.0);
        //static const float C1 = 0.2447f;
        //static const float C2 = 0.0663f;
        //static const float C3 = 1.f;

        // All four coefficients packed into one register; individual lanes are
        // broadcast below.
        static const __m128 C = _mm_setr_ps(float(3.14159265358979323846264338327950288 / 4.0),
                                            0.2447f,
                                            0.0663f,
                                            1.f);
        const auto c0 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(0, 0, 0, 0));
        const auto c1 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(1, 1, 1, 1));
        const auto c2 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(2, 2, 2, 2));
        const auto c3 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(3, 3, 3, 3));

        __m128 sign = _mm_load1_ps(&signBit);
        __m128 abs_x = _mm_andnot_ps(sign, x);

        __m128 t0 = _mm_mul_ps(c0, x);                  // t0 = PI/4 * x
        __m128 t1 = _mm_sub_ps(abs_x, c3);              // t1 = |x| - 1
        __m128 t2 = _mm_mul_ps(abs_x, c2);              // t2 = 0.0663*|x|
        __m128 t3 = _mm_add_ps(c1, t2);                 // t3 = 0.2447 + t2 (comment fixed: this is an add)
        __m128 t4 = _mm_mul_ps(x, _mm_mul_ps(t1, t3));  // t4 = x*t1*t3
        __m128 t5 = _mm_sub_ps(t0, t4);


        return t5;
    }
87
88 __forceinline __m128 asin_ps(__m128 x)
89 {
90 static const float signBit = -0.f;
91 static const float asin_deg3_C3 = -0.0187293f;
92 static const float asin_deg3_C2 = 0.0742610f;
93 static const float asin_deg3_C1 = -0.2121144f;
94 static const float asin_deg3_C0 = 1.5707288f;
95 static const __m128 asin_deg3_C = _mm_set_ps(asin_deg3_C3, asin_deg3_C2, asin_deg3_C1, asin_deg3_C0);
96 static const float one = 1.f;
97 static const float piTwo = 1.5707963267948966f;
98
99#if 0
100 // Reference
101 __m128 rv;
102 for (int i = 0; i < 4; i++) {
103 rv.m128_f32[i] = std::asin(x.m128_f32[i]);
104 }
105 return rv;
106#endif
107
108 __m128 sign = _mm_load1_ps(&signBit);
109 __m128 abs_x = _mm_andnot_ps(sign, x);
110
111#if 1
112 // Max error < 2e-5
113 __m128 C = _mm_load_ps((float*)(&asin_deg3_C));
114 __m128 r = _mm_mul_ps(_mm_shuffle_ps(C, C, _MM_SHUFFLE(3, 3, 3, 3)), abs_x);
115 r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(C, C, _MM_SHUFFLE(2, 2, 2, 2)), abs_x));
116 r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(C, C, _MM_SHUFFLE(1, 1, 1, 1)), abs_x));
117 r = _mm_add_ps(r, _mm_shuffle_ps(C, C, _MM_SHUFFLE(0, 0, 0, 0)));
118#elif 1
119 // Max error < 2e-8
120 __m128 r = _mm_mul_ps(_mm_load1_ps(&asin_deg7_C7), abs_x);
121 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C6), abs_x));
122 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C5), abs_x));
123 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C4), abs_x));
124 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C3), abs_x));
125 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C2), abs_x));
126 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C1), abs_x));
127 r = _mm_add_ps(r, _mm_load1_ps(&asin_deg7_C0));
128#endif
129
130 __m128 q = _mm_sub_ps(_mm_load1_ps(&one), abs_x);
131#if 1
132 q = _mm_sqrt_ps(q);
133#else
134 q = _mm_rcp_ps(_mm_rsqrt_ps(q));
135#endif
136
137 r = _mm_sub_ps(_mm_load1_ps(&piTwo), _mm_mul_ps(q, r));
138
139 // copy sign from x
140 return _mm_or_ps(r, _mm_and_ps(x, sign));
141 }
142
143
144 __forceinline __m128 asin_v2_ps(__m128 x)
145 {
146 const __m128 sign_ps = _mm_set1_ps(-0.f);
147 const __m128 one_ps = _mm_set1_ps(1.f);
148 const __m128 halfpi_ps = _mm_set1_ps(1.5707963267948966f);
149 const __m128 C3_ps = _mm_set1_ps(-0.0187293f);
150 const __m128 C2_ps = _mm_set1_ps(0.0742610f);
151 const __m128 C1_ps = _mm_set1_ps(-0.2121144f);
152 const __m128 C0_ps = _mm_set1_ps(1.5707288f);
153
154 __m128 abs_x = _mm_andnot_ps(sign_ps, x);
155
156 __m128 a = _mm_sub_ps(one_ps, abs_x);
157 a = _mm_sqrt_ps(a);
158
159 __m128 b = C3_ps;
160 b = _mm_add_ps(_mm_mul_ps(b, abs_x), C2_ps);
161 b = _mm_add_ps(_mm_mul_ps(b, abs_x), C1_ps);
162 b = _mm_add_ps(_mm_mul_ps(b, abs_x), C0_ps);
163
164 __m128 rv = _mm_sub_ps(halfpi_ps, _mm_mul_ps(a, b));
165
166 rv = _mm_or_ps(rv, _mm_and_ps(x, sign_ps));
167
168 return rv;
169 }
170
#if 0 // Static unit test disabled: a static-init test would run (and crash) on hosts without SSE4.1 support, and the extension framework has no proper unit-test hook.
#pragma optimize( "", off )
    // Self-registering smoke test: when enabled it runs at static-init time and
    // checks the approximation error of atan_00155_ps and asin_v2_ps against
    // the C runtime reference implementations over [-1, 1].
    static struct UnitTests
    {
        UnitTests()
        {
            const unsigned N = 1000;
            // atan_00155_ps: max abs error < 0.00155 over [-1, 1].
            for (unsigned i = 0; i < N; i++) {
                __m128 x, y;
                x.m128_f32[0] = (2.f / (N - 1))*i - 1.f;
                y = atan_00155_ps(x);
                auto e = std::abs(atan(x.m128_f32[0]) - y.m128_f32[0]);
                assert(e < 0.00155f);
            }

            // asin_v2_ps: max abs error < 7e-4 over [-1, 1].
            for (unsigned i = 0; i < N; i++) {
                __m128 x, y;
                x.m128_f32[0] = (2.f / (N - 1))*i - 1.f;
                y = asin_v2_ps(x);
                auto e = std::abs(std::asin(x.m128_f32[0]) - y.m128_f32[0]);
                assert(e < 7e-4f);
            }
        }
    } unitTests;
#pragma optimize( "", on )
#endif
197
198}
199
200namespace Cogs::Core::EchoSounder {
201
202 void sampleTile_border_sse41(float * data,
203 const float *v,
204 const glm::vec3 /*tileIndex*/,
205 const glm::uvec3 /*tilePos*/,
206 const glm::uvec3 dataSize,
207 const glm::uvec3 maxIndices,
208 const glm::vec3 tp,
209 const glm::vec3 scale,
210 const glm::vec3 arrayPositionGlobal,
211 const glm::vec4* frustum,
212 const float minDistanceSquared,
213 const float maxDistanceSquared,
214 const glm::quat inverseOrientation,
215 const uint32_t coordSys,
216 const uint32_t minorCount,
217 const uint32_t sampleCount,
218 const glm::vec3 polarScale,
219 const glm::vec3 polarShift)
220 {
221 const __m128 rot_ = _mm_set_ps(inverseOrientation.w, inverseOrientation.z, inverseOrientation.y, inverseOrientation.x);
222 assert((dataSize.x & 3) == 0);
223 assert(coordSys == 1);
224
225 glm::vec3 ban = arrayPositionGlobal - tp;
226
227 for (uint32_t z = 0; z < dataSize.z; z++) {
228 __m128 pz = _mm_mul_ss(_mm_set1_ps(scale.z), _mm_set1_ps((float)z));
229 __m128 qz = _mm_sub_ss(pz, _mm_set1_ps(ban.z));
230 for (uint32_t y = 0; y < dataSize.y; y++) {
231 __m128 py = _mm_mul_ss(_mm_set1_ps(scale.y), _mm_set1_ps((float)y));
232 __m128 qy = _mm_sub_ss(py, _mm_set1_ps(ban.y));
233 __m128 r2_yz = _mm_add_ss(_mm_mul_ss(qy, qy), _mm_mul_ss(qz, qz));
234 __m128 in0_dot_yz = _mm_add_ss(_mm_mul_ss(_mm_set1_ps(frustum[0].y), qy), _mm_mul_ss(_mm_set1_ps(frustum[0].z), qz));
235 __m128 in1_dot_yz = _mm_add_ss(_mm_mul_ss(_mm_set1_ps(frustum[1].y), qy), _mm_mul_ss(_mm_set1_ps(frustum[1].z), qz));
236 __m128 in2_dot_yz = _mm_add_ss(_mm_mul_ss(_mm_set1_ps(frustum[2].y), qy), _mm_mul_ss(_mm_set1_ps(frustum[2].z), qz));
237 __m128 in3_dot_yz = _mm_add_ss(_mm_mul_ss(_mm_set1_ps(frustum[3].y), qy), _mm_mul_ss(_mm_set1_ps(frustum[3].z), qz));
238
239 in0_dot_yz = _mm_shuffle_ps(in0_dot_yz, in0_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
240 in1_dot_yz = _mm_shuffle_ps(in1_dot_yz, in1_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
241 in2_dot_yz = _mm_shuffle_ps(in2_dot_yz, in2_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
242 in3_dot_yz = _mm_shuffle_ps(in3_dot_yz, in3_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
243 r2_yz = _mm_shuffle_ps(r2_yz, r2_yz, _MM_SHUFFLE(0, 0, 0, 0));
244
245 for (uint32_t x = 0; x < dataSize.x; x += 4) {
246
247 // sample position relative to ping origin
248 __m128 i = _mm_setr_ps((float)(x + 0), (float)(x + 1), (float)(x + 2), (float)(x + 3));
249
250 __m128 qx = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(scale.x), i), _mm_set1_ps(ban.x));
251
252 // squared radius
253 __m128 r2 = _mm_add_ps(_mm_mul_ps(qx, qx), r2_yz);
254
255 // compare q against frustum planes
256 __m128 mask0 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].x), qx), in0_dot_yz));
257 __m128 mask1 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].x), qx), in1_dot_yz));
258 __m128 mask2 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].x), qx), in2_dot_yz));
259 __m128 mask3 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].x), qx), in3_dot_yz));
260 __m128 mask4 = _mm_cmple_ps(_mm_set1_ps(minDistanceSquared), r2);
261 __m128 mask5 = _mm_cmple_ps(r2, _mm_set1_ps(maxDistanceSquared));
262 __m128 mask = _mm_and_ps(_mm_and_ps(_mm_and_ps(mask0, mask1),
263 _mm_and_ps(mask2, mask3)),
264 _mm_and_ps(mask4, mask5));
265 int movemask = _mm_movemask_ps(mask);
266 if (movemask == 0) {
267 continue;
268 }
269 // rotate into ping's orientation
270 __m128 ax, ay, az;
271 quat_times_vec3_ps(ax, ay, az,
272 rot_,
273 qx, broadcast_ps<0>(qy), broadcast_ps<0>(qz));
274
275 __m128 r_inv = _mm_rsqrt_ps(r2);
276 __m128 r = _mm_rcp_ps(r_inv);
277
278 // dirX = asin(a.x/r)
279 // dirY = atan(y/z)
280 __m128 dirx = asin_ps(_mm_mul_ps(ax, r_inv));
281 __m128 diry = atan_00155_ps(_mm_mul_ps(ay, _mm_rcp_ps(az)));
282
283 // Figure out interpolation parameters
284 __m128 xi_i = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.x), _mm_sub_ps(diry, _mm_set1_ps(polarShift.x))));
285 __m128 xi_j = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.y), _mm_sub_ps(dirx, _mm_set1_ps(polarShift.y))));
286 __m128 xi_k = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.z), _mm_sub_ps(r, _mm_set1_ps(polarShift.z))));
287 __m128 tau_i = _mm_floor_ps(xi_i);
288 __m128 tau_j = _mm_floor_ps(xi_j);
289 __m128 tau_k = _mm_floor_ps(xi_k);
290 __m128 t_i = _mm_sub_ps(xi_i, tau_i);
291 __m128 t_j = _mm_sub_ps(xi_j, tau_j);
292 __m128 t_k = _mm_sub_ps(xi_k, tau_k);
293
294 __m128 i_i = (_mm_min_ps(_mm_set1_ps((float)maxIndices.x), tau_i));
295 __m128 i_j = (_mm_min_ps(_mm_set1_ps((float)maxIndices.y), tau_j));
296 __m128 i_k = (_mm_min_ps(_mm_set1_ps((float)maxIndices.z), tau_k));
297 __m128 j_i = (_mm_min_ps(_mm_set1_ps((float)maxIndices.x), _mm_add_ps(tau_i, _mm_set1_ps(1.f))));
298 __m128 j_j = (_mm_min_ps(_mm_set1_ps((float)maxIndices.y), _mm_add_ps(tau_j, _mm_set1_ps(1.f))));
299
300 for (unsigned lane = 0; lane < 4; lane++) {
301 if (((movemask >> lane) & 1) == 0) continue;
302#if 0
303 // Same as reference
304 glm::vec3 a(ax.m128_f32[lane],
305 ay.m128_f32[lane],
306 az.m128_f32[lane]);
307
308 float r = std::sqrt(r2.m128_f32[lane]);
309
310 float dirX = std::asin(a.x / r);
311 float dirY = std::atan(a.y / a.z);
312 auto polarPos = glm::vec3(dirY, dirX, r);
313
314 // Initial float index:
315 glm::vec3 xi = glm::max(glm::vec3(0.f), polarScale*(polarPos - polarShift));
316
317 // Figure out interpolation parameters.
318 glm::vec3 tau = glm::floor(xi);
319
320 glm::vec3 t = xi - tau;
321 glm::uvec3 i = glm::min(maxIndices, glm::uvec3(tau));
322 glm::uvec2 j = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
323
324 float val00 = v[(i.y*minorCount + i.x)*sampleCount + i.z];
325 float val01 = v[(j.y*minorCount + i.x)*sampleCount + i.z];
326 float val0 = (1.f - t.y)*val00 + t.y*val01;
327
328 float val10 = v[(i.y*minorCount + j.x)*sampleCount + i.z];
329 float val11 = v[(j.y*minorCount + j.x)*sampleCount + i.z];
330 float val1 = (1.f - t.y)*val10 + t.y*val11;
331
332 float val = (1.f - t.x)*val0 + t.x*val1;
333 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
334 data[index] = val;
335#elif 0
336 // approximate trigonometry
337 glm::vec3 a(ax.m128_f32[lane],
338 ay.m128_f32[lane],
339 az.m128_f32[lane]);
340
341 float r = std::sqrt(r2.m128_f32[lane]);
342
343 float dirX = dirx.m128_f32[lane];
344 float dirY = diry.m128_f32[lane];
345 auto polarPos = glm::vec3(dirY, dirX, r);
346
347 // Initial float index:
348 glm::vec3 xi = glm::max(glm::vec3(0.f), polarScale*(polarPos - polarShift));
349
350 // Figure out interpolation parameters.
351 glm::vec3 tau = glm::floor(xi);
352
353 glm::vec3 t = xi - tau;
354 glm::uvec3 i = glm::min(maxIndices, glm::uvec3(tau));
355 glm::uvec2 j = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
356
357 float val00 = v[(i.y*minorCount + i.x)*sampleCount + i.z];
358 float val01 = v[(j.y*minorCount + i.x)*sampleCount + i.z];
359 float val0 = (1.f - t.y)*val00 + t.y*val01;
360
361 float val10 = v[(i.y*minorCount + j.x)*sampleCount + i.z];
362 float val11 = v[(j.y*minorCount + j.x)*sampleCount + i.z];
363 float val1 = (1.f - t.y)*val10 + t.y*val11;
364
365 float val = (1.f - t.x)*val0 + t.x*val1;
366 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
367 data[index] = val;
368#elif 0
369 glm::vec3 a(ax.m128_f32[lane],
370 ay.m128_f32[lane],
371 az.m128_f32[lane]);
372
373 float r_ = r.m128_f32[lane];
374
375 float dirX = dirx.m128_f32[lane];
376 float dirY = diry.m128_f32[lane];
377 auto polarPos = glm::vec3(dirY, dirX, r_);
378
379 // Initial float index:
380 glm::vec3 xi = glm::max(glm::vec3(0.f), polarScale*(polarPos - polarShift));
381 xi.x = xi_i.m128_f32[lane];
382 xi.y = xi_j.m128_f32[lane];
383 xi.z = xi_k.m128_f32[lane];
384
385 // Figure out interpolation parameters.
386 glm::vec3 tau = glm::floor(xi);
387 //tau.x = tau_i.m128_f32[lane];
388 //tau.y = tau_j.m128_f32[lane];
389 //tau.z = tau_k.m128_f32[lane];
390
391 glm::vec3 t = xi - tau;
392 glm::uvec3 i = glm::min(maxIndices, glm::uvec3(tau));
393 glm::uvec2 j = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
394
395 float val00 = v[(i.y*minorCount + i.x)*sampleCount + i.z];
396 float val01 = v[(j.y*minorCount + i.x)*sampleCount + i.z];
397 float val0 = (1.f - t.y)*val00 + t.y*val01;
398
399 float val10 = v[(i.y*minorCount + j.x)*sampleCount + i.z];
400 float val11 = v[(j.y*minorCount + j.x)*sampleCount + i.z];
401 float val1 = (1.f - t.y)*val10 + t.y*val11;
402
403 float val = (1.f - t.x)*val0 + t.x*val1;
404 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
405 data[index] = val;
406
407#else
408
409 // Figure out interpolation parameters.
410 glm::vec3 tau;// = glm::floor(xi);
411 tau.x = tau_i.m128_f32[lane];
412 tau.y = tau_j.m128_f32[lane];
413 tau.z = tau_k.m128_f32[lane];
414
415 glm::vec3 t; //= xi - tau;
416 t.x = t_i.m128_f32[lane];
417 t.x = t_j.m128_f32[lane];
418 t.x = t_k.m128_f32[lane];
419
420 glm::uvec3 ii;// = glm::min(maxIndices, glm::uvec3(tau));
421 ii.x = unsigned(i_i.m128_f32[lane]);
422 ii.y = unsigned(i_j.m128_f32[lane]);
423 ii.z = unsigned(i_k.m128_f32[lane]);
424
425 glm::uvec2 j;// = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
426 j.x = unsigned(j_i.m128_f32[lane]);
427 j.y = unsigned(j_j.m128_f32[lane]);
428
429
430
431 float val00 = v[(ii.y*minorCount + ii.x)*sampleCount + ii.z];
432 float val01 = v[(j.y*minorCount + ii.x)*sampleCount + ii.z];
433 float val0 = (1.f - t.y)*val00 + t.y*val01;
434
435 float val10 = v[(ii.y*minorCount + j.x)*sampleCount + ii.z];
436 float val11 = v[(j.y*minorCount + j.x)*sampleCount + ii.z];
437 float val1 = (1.f - t.y)*val10 + t.y*val11;
438
439 float val = (1.f - t.x)*val0 + t.x*val1;
440
441 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
442 data[index] = val;
443 //tile.age[index] = float(10e-7*((*m)[0].ping->timestamp - misc.ping->timestamp));
444#endif
445 }
446 }
447 }
448 }
449 }
450
451
452
453 //#pragma optimize( "", off )
454 void sampleTile_inner_sse41(float * data,
455 const float *v,
456 const glm::vec3 /*tileIndex*/,
457 const glm::uvec3 /*tilePos*/,
458 const glm::uvec3 dataSize,
459 const glm::uvec3 maxIndices,
460 const glm::vec3 tp,
461 const glm::vec3 scale,
462 const glm::vec3 arrayPositionGlobal,
463 const glm::vec4* frustum,
464 const float minDistanceSquared,
465 const float maxDistanceSquared,
466 const glm::quat inverseOrientation,
467 const uint32_t coordSys,
468 const uint32_t minorCount,
469 const uint32_t sampleCount,
470 const glm::vec3 polarScale,
471 const glm::vec3 polarShift)
472 {
473 const __m128 rot_ = _mm_set_ps(inverseOrientation.w, inverseOrientation.z, inverseOrientation.y, inverseOrientation.x);
474 assert((dataSize.x & 3) == 0);
475 assert(coordSys == 1);
476
477 static const __m128 c0123 = _mm_setr_ps(0, 1, 2, 3);
478 static const __m128 one_ps = _mm_setr_ps(1.f, 1.f, 1.f, 1.f);
479 glm::vec3 ban = arrayPositionGlobal - tp;
480
481 for (uint32_t z = 0; z < dataSize.z; z++) {
482 __m128 pz = _mm_mul_ss(_mm_set1_ps(scale.z), _mm_set1_ps((float)z));
483 __m128 qz = _mm_sub_ss(pz, _mm_set1_ps(ban.z));
484 for (uint32_t y = 0; y < dataSize.y; y++) {
485 __m128 py = _mm_mul_ss(_mm_set1_ps(scale.y), _mm_set1_ps((float)y));
486 __m128 qy = _mm_sub_ss(py, _mm_set1_ps(ban.y));
487 __m128 r2_yz = _mm_add_ss(_mm_mul_ss(qy, qy), _mm_mul_ss(qz, qz));
488 __m128 in0_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[0].z), qz));
489 __m128 in1_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[1].z), qz));
490 __m128 in2_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[2].z), qz));
491 __m128 in3_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[3].z), qz));
492
493 in0_dot_yz = _mm_shuffle_ps(in0_dot_yz, in0_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
494 in1_dot_yz = _mm_shuffle_ps(in1_dot_yz, in1_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
495 in2_dot_yz = _mm_shuffle_ps(in2_dot_yz, in2_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
496 in3_dot_yz = _mm_shuffle_ps(in3_dot_yz, in3_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
497 r2_yz = _mm_shuffle_ps(r2_yz, r2_yz, _MM_SHUFFLE(0, 0, 0, 0));
498
499 for (uint32_t x = 0; x < dataSize.x; x += 4) {
500
501 // sample position relative to ping origin
502 __m128 i = _mm_add_ps(_mm_set1_ps((float)x), c0123);
503 __m128 qx = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(scale.x), i), _mm_set1_ps(ban.x));
504
505 // squared radius
506 __m128 r2 = _mm_add_ps(_mm_mul_ps(qx, qx), r2_yz);
507
508 // compare q against frustum planes
509 __m128 gmask0 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].x), qx), in0_dot_yz);
510 __m128 gmask1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].x), qx), in1_dot_yz);
511 __m128 gmask2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].x), qx), in2_dot_yz);
512 __m128 gmask3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].x), qx), in3_dot_yz);
513
514
515 __m128 mask0 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].x), qx), in0_dot_yz));
516 __m128 mask1 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].x), qx), in1_dot_yz));
517 __m128 mask2 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].x), qx), in2_dot_yz));
518 __m128 mask3 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].x), qx), in3_dot_yz));
519 __m128 mask4 = _mm_cmple_ps(_mm_set1_ps(minDistanceSquared), r2);
520 __m128 mask5 = _mm_cmple_ps(r2, _mm_set1_ps(maxDistanceSquared));
521 __m128 mask = _mm_and_ps(_mm_and_ps(_mm_and_ps(mask0, mask1),
522 _mm_and_ps(mask2, mask3)),
523 _mm_and_ps(mask4, mask5));
524 //int movemask = _mm_movemask_ps(mask);
525 //if (movemask == 0) {
526 // assert(false);
527 // continue;
528 //}
529 // rotate into ping's orientation
530 __m128 ax, ay, az;
531 quat_times_vec3_ps(ax, ay, az,
532 rot_,
533 qx, broadcast_ps<0>(qy), broadcast_ps<0>(qz));
534
535 __m128 r_inv = _mm_rsqrt_ps(r2);
536 __m128 r = _mm_rcp_ps(r_inv);
537
538 // dirX = asin(a.x/r)
539 // dirY = atan(y/z)
540 __m128 dirx = asin_ps(_mm_mul_ps(ax, r_inv));
541 __m128 diry = atan_00155_ps(_mm_mul_ps(ay, _mm_rcp_ps(az)));
542
543 // Figure out interpolation parameters
544 __m128 xi_i = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.x), _mm_sub_ps(diry, _mm_set1_ps(polarShift.x))));
545 __m128 xi_j = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.y), _mm_sub_ps(dirx, _mm_set1_ps(polarShift.y))));
546 __m128 xi_k = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.z), _mm_sub_ps(r, _mm_set1_ps(polarShift.z))));
547 __m128 tau_i = _mm_floor_ps(xi_i);
548 __m128 tau_j = _mm_floor_ps(xi_j);
549 __m128 tau_k = _mm_floor_ps(xi_k);
550 __m128 t_i = _mm_sub_ps(xi_i, tau_i);
551 __m128 t_j = _mm_sub_ps(xi_j, tau_j);
552 __m128 t_k = _mm_sub_ps(xi_k, tau_k);
553
554 __m128 i_i = (_mm_min_ps(_mm_set1_ps((float)maxIndices.x), tau_i));
555 __m128 i_j = (_mm_min_ps(_mm_set1_ps((float)maxIndices.y), tau_j));
556 __m128 i_k = (_mm_min_ps(_mm_set1_ps((float)maxIndices.z), tau_k));
557 __m128 j_i = (_mm_min_ps(_mm_set1_ps((float)maxIndices.x), _mm_add_ps(tau_i, _mm_set1_ps(1.f))));
558 __m128 j_j = (_mm_min_ps(_mm_set1_ps((float)maxIndices.y), _mm_add_ps(tau_j, _mm_set1_ps(1.f))));
559
560
561 __m128i i_i_ = _mm_cvtps_epi32(i_i);
562 __m128i i_j_ = _mm_cvtps_epi32(i_j);
563 __m128i i_k_ = _mm_cvtps_epi32(i_k);
564
565 __m128i j_i_ = _mm_cvtps_epi32(j_i);
566 __m128i j_j_ = _mm_cvtps_epi32(j_j);
567
568 __m128i ix00 = _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(sampleCount), _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(minorCount), i_j_), i_i_)), i_k_);
569 __m128 val00 = _mm_setr_ps(v[ix00.m128i_u32[0]],
570 v[ix00.m128i_u32[1]],
571 v[ix00.m128i_u32[2]],
572 v[ix00.m128i_u32[3]]);
573
574 __m128i ix01 = _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(sampleCount), _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(minorCount), j_j_), i_i_)), i_k_);
575 __m128 val01 = _mm_setr_ps(v[ix01.m128i_u32[0]],
576 v[ix01.m128i_u32[1]],
577 v[ix01.m128i_u32[2]],
578 v[ix01.m128i_u32[3]]);
579 __m128 val0 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one_ps, t_j), val00), _mm_mul_ps(t_j, val01));
580
581
582 __m128i ix10 = _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(sampleCount), _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(minorCount), i_j_), j_i_)), i_k_);
583 __m128 val10 = _mm_setr_ps(v[ix10.m128i_u32[0]],
584 v[ix10.m128i_u32[1]],
585 v[ix10.m128i_u32[2]],
586 v[ix10.m128i_u32[3]]);
587
588 __m128i ix11 = _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(sampleCount), _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(minorCount), j_j_), j_i_)), i_k_);
589 __m128 val11 = _mm_setr_ps(v[ix01.m128i_u32[0]],
590 v[ix01.m128i_u32[1]],
591 v[ix01.m128i_u32[2]],
592 v[ix01.m128i_u32[3]]);
593
594 __m128 val1 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one_ps, t_j), val10), _mm_mul_ps(t_j, val11));
595
596 __m128 val_ = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one_ps, t_i), val0), _mm_mul_ps(t_i, val1));
597
598 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x;
599 _mm_store_ps(data + index, val_);
600
601
602#if 0
603 for (unsigned lane = 0; lane < 4; lane++) {
604 //if (((movemask >> lane) & 1) == 0) {
605 // continue;
606 //}
607#if 0
608 // Same as reference
609 glm::vec3 a(ax.m128_f32[lane],
610 ay.m128_f32[lane],
611 az.m128_f32[lane]);
612
613 float r = std::sqrt(r2.m128_f32[lane]);
614
615 float dirX = std::asin(a.x / r);
616 float dirY = std::atan(a.y / a.z);
617 auto polarPos = glm::vec3(dirY, dirX, r);
618
619 // Initial float index:
620 glm::vec3 xi = glm::max(glm::vec3(0.f), polarScale*(polarPos - polarShift));
621
622 // Figure out interpolation parameters.
623 glm::vec3 tau = glm::floor(xi);
624
625 glm::vec3 t = xi - tau;
626 glm::uvec3 i = glm::min(maxIndices, glm::uvec3(tau));
627 glm::uvec2 j = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
628
629 float val00 = v[(i.y*minorCount + i.x)*sampleCount + i.z];
630 float val01 = v[(j.y*minorCount + i.x)*sampleCount + i.z];
631 float val0 = (1.f - t.y)*val00 + t.y*val01;
632
633 float val10 = v[(i.y*minorCount + j.x)*sampleCount + i.z];
634 float val11 = v[(j.y*minorCount + j.x)*sampleCount + i.z];
635 float val1 = (1.f - t.y)*val10 + t.y*val11;
636
637 float val = (1.f - t.x)*val0 + t.x*val1;
638 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
639 data[index] = val;
640#else
641 // Figure out interpolation parameters.
642 glm::vec3 tau;// = glm::floor(xi);
643 tau.x = tau_i.m128_f32[lane];
644 tau.y = tau_j.m128_f32[lane];
645 tau.z = tau_k.m128_f32[lane];
646
647 glm::vec3 t; //= xi - tau;
648 t.x = t_i.m128_f32[lane];
649 t.x = t_j.m128_f32[lane];
650 t.x = t_k.m128_f32[lane];
651
652 glm::uvec3 i;// = glm::min(maxIndices, glm::uvec3(tau));
653 i.x = unsigned(i_i.m128_f32[lane]);
654 i.y = unsigned(i_j.m128_f32[lane]);
655 i.z = unsigned(i_k.m128_f32[lane]);
656
657 glm::uvec2 j;// = glm::min(glm::uvec2(maxIndices), glm::uvec2(i) + glm::uvec2(1));
658 j.x = unsigned(j_i.m128_f32[lane]);
659 j.y = unsigned(j_j.m128_f32[lane]);
660
661
662 float val00 = v[ix00.m128i_u32[lane]];
663 float val01 = v[ix01.m128i_u32[lane]];
664 //float val00 = v[(i.y*minorCount + i.x)*sampleCount + i.z];
665 //float val01 = v[(j.y*minorCount + i.x)*sampleCount + i.z];
666 float val0 = (1.f - t.y)*val00 + t.y*val01;
667
668 float val10 = v[ix10.m128i_u32[lane]];
669 float val11 = v[ix11.m128i_u32[lane]];
670 //float val10 = v[(i.y*minorCount + j.x)*sampleCount + i.z];
671 //float val11 = v[(j.y*minorCount + j.x)*sampleCount + i.z];
672 float val1 = (1.f - t.y)*val10 + t.y*val11;
673
674 float val = (1.f - t.x)*val0 + t.x*val1;
675
676 val = val_.m128_f32[lane];
677
678 uint32_t index = z * dataSize.y*dataSize.x + y * dataSize.x + x + lane;
679 data[index] = val;
680#endif
681
682 }
683#endif
684 }
685 }
686 }
687 }
688#pragma optimize( "", on )
689
690}