Cogs.Core
SampleVolumeTask_SSE.cpp
1#if 0
2#include "SampleVolumeTask.h"
3
4using namespace Cogs::Core;
5
6using glm::uvec3;
7using glm::ivec3;
8using glm::vec3;
9using glm::clamp;
10using glm::max;
11using glm::min;
12using glm::floor;
13using glm::ceil;
14using glm::inverse;
15
16namespace {
17 static const float piTwo = 1.5707963267948966f;
18 static const float pi = 3.1415926535897931f;
19 static const float signBit = -0.f;
20 static const float one = 1.f;
21
22 static const float asin_deg3_C3 = -0.0187293f;
23 static const float asin_deg3_C2 = 0.0742610f;
24 static const float asin_deg3_C1 = -0.2121144f;
25 static const float asin_deg3_C0 = 1.5707288f;
26 static const __m128 asin_deg3_C = _mm_set_ps(asin_deg3_C3, asin_deg3_C2, asin_deg3_C1, asin_deg3_C0);
27
28 static const float asin_deg7_C7 = -0.0012624911f;
29 static const float asin_deg7_C6 = 0.0066700901f;
30 static const float asin_deg7_C5 = -0.0170881256f;
31 static const float asin_deg7_C4 = 0.0308918810f;
32 static const float asin_deg7_C3 = -0.0501743046f;
33 static const float asin_deg7_C2 = 0.0889789874f;
34 static const float asin_deg7_C1 = -0.2145988016f;
35 static const float asin_deg7_C0 = 1.5707963050f;
36
37
38 template<int lane> inline __m128 broadcast_ps(__m128 x) {
39 return _mm_shuffle_ps(x, x, _MM_SHUFFLE(lane, lane, lane, lane));
40 }
41
42 __m128 asin_ps(__m128 x)
43 {
44#if 0
45 // Reference
46 __m128 rv;
47 for (int i = 0; i < 4; i++) {
48 rv.m128_f32[i] = std::asin(x.m128_f32[i]);
49 }
50 return rv;
51#endif
52
53 __m128 sign = _mm_load1_ps(&signBit);
54 __m128 abs_x = _mm_andnot_ps(sign, x);
55
56#if 1
57 // Max error < 2e-5
58 __m128 C = _mm_load_ps((float*)(&asin_deg3_C));
59 __m128 r = _mm_mul_ps(_mm_shuffle_ps(C, C, _MM_SHUFFLE(3, 3, 3, 3)), abs_x);
60 r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(C, C, _MM_SHUFFLE(2, 2, 2, 2)), abs_x));
61 r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(C, C, _MM_SHUFFLE(1, 1, 1, 1)), abs_x));
62 r = _mm_add_ps(r, _mm_shuffle_ps(C, C, _MM_SHUFFLE(0, 0, 0, 0)));
63#elif 1
64 // Max error < 2e-8
65 __m128 r = _mm_mul_ps(_mm_load1_ps(&asin_deg7_C7), abs_x);
66 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C6), abs_x));
67 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C5), abs_x));
68 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C4), abs_x));
69 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C3), abs_x));
70 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C2), abs_x));
71 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C1), abs_x));
72 r = _mm_add_ps(r, _mm_load1_ps(&asin_deg7_C0));
73#endif
74
75 __m128 q = _mm_sub_ps(_mm_load1_ps(&one), abs_x);
76#if 0
77 q = _mm_sqrt_ps(q);
78#else
79 q = _mm_rcp_ps(_mm_rsqrt_ps(q));
80#endif
81
82 r = _mm_sub_ps(_mm_load1_ps(&piTwo), _mm_mul_ps(q, r));
83
84 // copy sign from x
85 return _mm_or_ps(r, _mm_and_ps(x, sign));
86 }
87
88 void quat_times_vec3_ps(__m128& out_x, __m128& out_y, __m128& out_z,
89 const __m128& q,
90 const __m128& v_x, const __m128& v_y, const __m128& v_z)
91 {
92#if 0
93 // Reference
94 glm::quat rot;
95 rot.x = q.m128_f32[0];
96 rot.y = q.m128_f32[1];
97 rot.z = q.m128_f32[2];
98 rot.w = q.m128_f32[3];
99 for (int i = 0; i < 4; i++) {
100 glm::vec3 v;
101 v.x = v_x.m128_f32[i];
102 v.y = v_y.m128_f32[i];
103 v.z = v_z.m128_f32[i];
104
105 glm::vec3 w = rot*v;
106 out_x.m128_f32[i] = w.x;
107 out_y.m128_f32[i] = w.y;
108 out_z.m128_f32[i] = w.z;
109 }
110 return;
111#endif
112
113 __m128 q_x = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0, 0, 0, 0));
114 __m128 q_y = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1, 1, 1, 1));
115 __m128 q_z = _mm_shuffle_ps(q, q, _MM_SHUFFLE(2, 2, 2, 2));
116 __m128 q_w = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
117
118 __m128 uv_x = _mm_sub_ps(_mm_mul_ps(q_y, v_z), _mm_mul_ps(v_y, q_z));
119 __m128 uv_y = _mm_sub_ps(_mm_mul_ps(q_z, v_x), _mm_mul_ps(v_z, q_x));
120 __m128 uv_z = _mm_sub_ps(_mm_mul_ps(q_x, v_y), _mm_mul_ps(v_x, q_y));
121 __m128 uuv_x = _mm_sub_ps(_mm_mul_ps(q_y, uv_z), _mm_mul_ps(uv_y, q_z));
122 __m128 uuv_y = _mm_sub_ps(_mm_mul_ps(q_z, uv_x), _mm_mul_ps(uv_z, q_x));
123 __m128 uuv_z = _mm_sub_ps(_mm_mul_ps(q_x, uv_y), _mm_mul_ps(uv_x, q_y));
124 __m128 t_x = _mm_add_ps(_mm_mul_ps(q_w, uv_x), uuv_x);
125 __m128 t_y = _mm_add_ps(_mm_mul_ps(q_w, uv_y), uuv_y);
126 __m128 t_z = _mm_add_ps(_mm_mul_ps(q_w, uv_z), uuv_z);
127 out_x = _mm_add_ps(v_x, _mm_add_ps(t_x, t_x));
128 out_y = _mm_add_ps(v_y, _mm_add_ps(t_y, t_y));
129 out_z = _mm_add_ps(v_z, _mm_add_ps(t_z, t_z));
130 }
131
132}
133
134
135void EchoSounder::SampleVolumeTask2::runSSE4_1(TileValue * dstValues,
136 uint16_t * timeValues,
137 const EchoSounder::PingInfo* pinf,
138 const uint32_t upperFansToRemove,
139 const uint32_t numPings,
140 const uint32_t numSamples,
141 const uint32_t gridSizeX,
142 const uint32_t gridSizeY,
143 const float depthOffset,
144 const float depthStep,
145 const float sampleSpacing,
146 const float decay,
147 const std::vector<float>& beamAngleAlongship,
148 const std::vector<float>& beamAngleAthwartship,
149 const glm::ivec3& this_minIndex,
150 const glm::ivec3& this_maxIndex)
151{
152 const auto time = pinf->time;
153 const auto weight = 1.f;// / data.numPings;
154 const auto numBeamsMajor = static_cast<uint32_t>(beamAngleAlongship.size());
155 const auto numBeamsMinor = static_cast<uint32_t>(beamAngleAthwartship.size());
156 const auto s = 1.f / sampleSpacing;
157 //const auto width = data.cellWidth;
158
159 const vec3 minPolar(beamAngleAthwartship.front(),
160 beamAngleAlongship.front(),
161 depthOffset);
162 const vec3 maxPolar(beamAngleAthwartship.back(),
163 beamAngleAlongship.back(),
164 depthOffset + (numSamples - 1)*depthStep);
165
166
167 const uvec3 maxTau(max(2u, numBeamsMinor) - 2,
168 max(2u + upperFansToRemove, numBeamsMajor) - 2 - upperFansToRemove,
169 max(2u, numSamples) - 2);
170
171 // map minPolar to 0 and maxPolar to N-1.
172 const vec3 polarToIndex = vec3(max(2u, numBeamsMinor) - 2,
173 max(2u, numBeamsMajor) - 2,
174 max(2u, numSamples) - 2) / (maxPolar - minPolar);
175
176 const auto * minorMu = beamAngleAthwartship.data();
177 const auto * majorMu = beamAngleAlongship.data();
178
179 for (uint32_t p = 0; p < numPings; p++) {
180
181 const auto minIndex = this_minIndex;// max(this_minIndex, ivec3(floor(s * pinf[p].boundingBoxTile[0])));
182 const auto maxIndex = this_maxIndex;// min(this_maxIndex, ivec3(ceil(s * pinf[p].boundingBoxTile[1])));
183 const auto * srcValues = pinf[p].field;
184
185 const auto rot = inverse(pinf[p].metaPing.arrayOrientationGlobal);
186 const __m128 rot_ = _mm_set_ps(rot.w, rot.z, rot.y, rot.x);
187 const auto shift = pinf[p].arrayPositionTile;
188
189 const __m128 n_x = _mm_set_ps(
190 pinf[p].boundingFrustumNormals[3].x,
191 pinf[p].boundingFrustumNormals[2].x,
192 pinf[p].boundingFrustumNormals[1].x,
193 pinf[p].boundingFrustumNormals[0].x
194 );
195 for (int k = minIndex.z; k < maxIndex.z; k++) {
196
197 float samplePosGrid_z = sampleSpacing*k;
198 if (samplePosGrid_z < pinf[p].depthRestriction) continue;
199
200 for (int j = minIndex.y; j < maxIndex.y; j++) {
201 float samplePosGrid_y = sampleSpacing*j;
202
203 const auto l_z = sampleSpacing*k - shift.z;
204 const auto l_y = sampleSpacing*j - shift.y;
205 const __m128 eq_a = _mm_set_ps(
206 pinf[p].boundingFrustumNormals[3].z * l_z + pinf[p].boundingFrustumNormals[3].y*l_y,
207 pinf[p].boundingFrustumNormals[2].z * l_z + pinf[p].boundingFrustumNormals[2].y*l_y,
208 pinf[p].boundingFrustumNormals[1].z * l_z + pinf[p].boundingFrustumNormals[1].y*l_y,
209 pinf[p].boundingFrustumNormals[0].z * l_z + pinf[p].boundingFrustumNormals[0].y*l_y
210 );
211 const auto l_zy_2 = l_z*l_z + l_y*l_y;
212
213 for (int i = minIndex.x; i < maxIndex.x; i+=4) {
214
215 __m128 ii = _mm_set_ps(static_cast<float>(i + 3),
216 static_cast<float>(i + 2),
217 static_cast<float>(i + 1),
218 static_cast<float>(i + 0));
219 __m128 mask = _mm_cmple_ps(ii, _mm_set1_ps(static_cast<float>(maxIndex.x)));
220 __m128 l_x = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(sampleSpacing), ii), _mm_set1_ps(shift.x));
221 __m128 r2_ = _mm_add_ps(_mm_set1_ps(l_zy_2), _mm_mul_ps(l_x, l_x));
222
223 // Early exit: check against frustum planes
224 __m128 mask_0 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(broadcast_ps<0>(eq_a), _mm_mul_ps(broadcast_ps<0>(n_x), l_x)));
225 __m128 mask_1 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(broadcast_ps<1>(eq_a), _mm_mul_ps(broadcast_ps<1>(n_x), l_x)));
226 __m128 mask_2 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(broadcast_ps<2>(eq_a), _mm_mul_ps(broadcast_ps<2>(n_x), l_x)));
227 __m128 mask_3 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(broadcast_ps<3>(eq_a), _mm_mul_ps(broadcast_ps<3>(n_x), l_x)));
228 __m128 mask_4 = _mm_cmple_ps(_mm_set1_ps(pinf[p].minDepthSquared), r2_);
229 __m128 mask_5 = _mm_cmple_ps(r2_, _mm_set1_ps(pinf[p].maxDepthSquared));
230 mask = _mm_and_ps(_mm_and_ps(_mm_and_ps(mask_0, mask_1),
231 _mm_and_ps(mask_2, mask_3)),
232 _mm_and_ps(_mm_and_ps(mask_4, mask_5),
233 mask));
234 if (_mm_movemask_ps(mask) == 0) {
235 continue;
236 }
237
238 __m128 cartesian_x, cartesian_y, cartesian_z;
239 quat_times_vec3_ps(cartesian_x, cartesian_y, cartesian_z,
240 rot_,
241 l_x, _mm_set1_ps(l_y), _mm_set1_ps(l_z));
242 __m128 rep_r = _mm_rsqrt_ps(r2_);
243 __m128 r_ = _mm_rcp_ps(rep_r);
244
245
246 float samplePosGrid_x = sampleSpacing*i;
247 const auto samplePosArray = (glm::vec3(samplePosGrid_x, samplePosGrid_y, samplePosGrid_z) - shift);
248 const auto r2 = glm::dot(samplePosArray, samplePosArray);
249 const auto cartesianPos = rot * samplePosArray;
250 float r = std::sqrt(r2);
251
252 __m128 dirX = asin_ps(_mm_mul_ps(cartesian_x, rep_r)); //std::asin(cartesianPos.x / r);
253 __m128 dirY = asin_ps(_mm_mul_ps(cartesian_y, rep_r)); //std::asin(cartesianPos.y / r);
254
255 // calculate xi (source indices with fractional part).
256 __m128 xi_x = _mm_mul_ps(_mm_set1_ps(polarToIndex.x),
257 _mm_sub_ps(dirY, _mm_set1_ps(minPolar.x)));
258 __m128 xi_y = _mm_mul_ps(_mm_set1_ps(polarToIndex.y),
259 _mm_sub_ps(dirX, _mm_set1_ps(minPolar.y)));
260 __m128 xi_z = _mm_mul_ps(_mm_set1_ps(polarToIndex.z),
261 _mm_sub_ps(r_, _mm_set1_ps(minPolar.z)));
262
263 __m128 tau_x = _mm_floor_ps(xi_x); // floor is SSE4.1
264 __m128 tau_y = _mm_floor_ps(xi_y);
265 __m128 tau_z = _mm_floor_ps(xi_z);
266
267 __m128i tau_x_vui = _mm_cvtps_epi32(tau_x);
268 __m128i tau_y_vui = _mm_cvtps_epi32(tau_y);
269 __m128i tau_z_vui = _mm_cvtps_epi32(tau_z);
270 __m128 bx_ = _mm_sub_ps(xi_x, tau_x);
271 __m128 by_ = _mm_sub_ps(xi_y, tau_y);
272
273 auto upper_x = _mm_set1_epi32(maxTau.x);
274 auto inrange_x = _mm_cmpeq_epi32(_mm_max_epu32(tau_x_vui, upper_x), upper_x);
275
276 auto upper_y = _mm_set1_epi32(maxTau.y);
277 auto inrange_y = _mm_cmpeq_epi32(_mm_max_epu32(tau_y_vui, upper_y), upper_y);
278
279 auto upper_z = _mm_set1_epi32(maxTau.z);
280 auto inrange_z = _mm_cmpeq_epi32(_mm_max_epu32(tau_z_vui, upper_z), upper_z);
281
282 mask = _mm_and_ps(_mm_and_ps(mask, _mm_castsi128_ps(inrange_x)),
283 _mm_and_ps(_mm_castsi128_ps(inrange_y), _mm_castsi128_ps(inrange_z)));
284
285 int movemask = _mm_movemask_ps(mask);
286 if (movemask == 0) continue;
287
288 for (uint32_t lane = 0; lane < 4; lane++) {
289 if (((movemask >> lane) & 1) == 0) continue;
290
291 uint32_t ix_x = tau_x_vui.m128i_u32[lane];
292 uint32_t ix_y = tau_y_vui.m128i_u32[lane];
293 uint32_t ix_z = tau_z_vui.m128i_u32[lane];
294
295 // Bi-linear interpolation, nearest nb along depth.
296 float bx = bx_.m128_f32[lane];//xi.x - tau.x;
297 float by = by_.m128_f32[lane];//xi.y - tau.y;
298
299 const auto val00 = srcValues[((ix_y + 0)*numBeamsMinor + (ix_x + 0))*numSamples + ix_z];
300 const auto val01 = srcValues[((ix_y + 1)*numBeamsMinor + (ix_x + 0))*numSamples + ix_z];
301 const auto val0 = (1.f - by)*PingValue_Decode(val00) + by*PingValue_Decode(val01);
302
303 const auto val10 = srcValues[((ix_y + 0)*numBeamsMinor + (ix_x + 1))*numSamples + ix_z];
304 const auto val11 = srcValues[((ix_y + 1)*numBeamsMinor + (ix_x + 1))*numSamples + ix_z];
305 const auto val1 = (1.f - by)*PingValue_Decode(val10) + by*PingValue_Decode(val11);
306
307 //const auto val = 10000000.f;//val11;// (1.f - bx)*val0 + bx*val1;
308 const auto val = (1.f - bx)*val0 + bx*val1;
309
310 auto dstOffset = (k*gridSizeY + j)*gridSizeX + i + lane;
311
312
313 auto oldValue = TileValue_Decode(dstValues[dstOffset]);
314 if (oldValue == 0.0) oldValue = val;
315
316 dstValues[dstOffset] = TileValue_Encode(0.5f*oldValue + 0.5f*val);
317 //dstValues[dstOffset] = max(oldValue, val);
318 timeValues[dstOffset] = time;
319 }
320 }
321 }
322 }
323 }
324}
325#endif
Contains the Engine, Renderer, resource managers and other systems needed to run Cogs....