2#include "SampleVolumeTask.h"
17 static const float piTwo = 1.5707963267948966f;
18 static const float pi = 3.1415926535897931f;
19 static const float signBit = -0.f;
20 static const float one = 1.f;
22 static const float asin_deg3_C3 = -0.0187293f;
23 static const float asin_deg3_C2 = 0.0742610f;
24 static const float asin_deg3_C1 = -0.2121144f;
25 static const float asin_deg3_C0 = 1.5707288f;
26 static const __m128 asin_deg3_C = _mm_set_ps(asin_deg3_C3, asin_deg3_C2, asin_deg3_C1, asin_deg3_C0);
28 static const float asin_deg7_C7 = -0.0012624911f;
29 static const float asin_deg7_C6 = 0.0066700901f;
30 static const float asin_deg7_C5 = -0.0170881256f;
31 static const float asin_deg7_C4 = 0.0308918810f;
32 static const float asin_deg7_C3 = -0.0501743046f;
33 static const float asin_deg7_C2 = 0.0889789874f;
34 static const float asin_deg7_C1 = -0.2145988016f;
35 static const float asin_deg7_C0 = 1.5707963050f;
38 template<
int lane>
inline __m128 broadcast_ps(__m128 x) {
39 return _mm_shuffle_ps(x, x, _MM_SHUFFLE(lane, lane, lane, lane));
42 __m128 asin_ps(__m128 x)
47 for (
int i = 0; i < 4; i++) {
48 rv.m128_f32[i] = std::asin(x.m128_f32[i]);
53 __m128 sign = _mm_load1_ps(&signBit);
54 __m128 abs_x = _mm_andnot_ps(sign, x);
58 __m128 C = _mm_load_ps((
float*)(&asin_deg3_C));
59 __m128 r = _mm_mul_ps(_mm_shuffle_ps(C, C, _MM_SHUFFLE(3, 3, 3, 3)), abs_x);
60 r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(C, C, _MM_SHUFFLE(2, 2, 2, 2)), abs_x));
61 r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(C, C, _MM_SHUFFLE(1, 1, 1, 1)), abs_x));
62 r = _mm_add_ps(r, _mm_shuffle_ps(C, C, _MM_SHUFFLE(0, 0, 0, 0)));
65 __m128 r = _mm_mul_ps(_mm_load1_ps(&asin_deg7_C7), abs_x);
66 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C6), abs_x));
67 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C5), abs_x));
68 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C4), abs_x));
69 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C3), abs_x));
70 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C2), abs_x));
71 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C1), abs_x));
72 r = _mm_add_ps(r, _mm_load1_ps(&asin_deg7_C0));
75 __m128 q = _mm_sub_ps(_mm_load1_ps(&one), abs_x);
79 q = _mm_rcp_ps(_mm_rsqrt_ps(q));
82 r = _mm_sub_ps(_mm_load1_ps(&piTwo), _mm_mul_ps(q, r));
85 return _mm_or_ps(r, _mm_and_ps(x, sign));
88 void quat_times_vec3_ps(__m128& out_x, __m128& out_y, __m128& out_z,
90 const __m128& v_x,
const __m128& v_y,
const __m128& v_z)
95 rot.x = q.m128_f32[0];
96 rot.y = q.m128_f32[1];
97 rot.z = q.m128_f32[2];
98 rot.w = q.m128_f32[3];
99 for (
int i = 0; i < 4; i++) {
101 v.x = v_x.m128_f32[i];
102 v.y = v_y.m128_f32[i];
103 v.z = v_z.m128_f32[i];
106 out_x.m128_f32[i] = w.x;
107 out_y.m128_f32[i] = w.y;
108 out_z.m128_f32[i] = w.z;
113 __m128 q_x = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0, 0, 0, 0));
114 __m128 q_y = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1, 1, 1, 1));
115 __m128 q_z = _mm_shuffle_ps(q, q, _MM_SHUFFLE(2, 2, 2, 2));
116 __m128 q_w = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
118 __m128 uv_x = _mm_sub_ps(_mm_mul_ps(q_y, v_z), _mm_mul_ps(v_y, q_z));
119 __m128 uv_y = _mm_sub_ps(_mm_mul_ps(q_z, v_x), _mm_mul_ps(v_z, q_x));
120 __m128 uv_z = _mm_sub_ps(_mm_mul_ps(q_x, v_y), _mm_mul_ps(v_x, q_y));
121 __m128 uuv_x = _mm_sub_ps(_mm_mul_ps(q_y, uv_z), _mm_mul_ps(uv_y, q_z));
122 __m128 uuv_y = _mm_sub_ps(_mm_mul_ps(q_z, uv_x), _mm_mul_ps(uv_z, q_x));
123 __m128 uuv_z = _mm_sub_ps(_mm_mul_ps(q_x, uv_y), _mm_mul_ps(uv_x, q_y));
124 __m128 t_x = _mm_add_ps(_mm_mul_ps(q_w, uv_x), uuv_x);
125 __m128 t_y = _mm_add_ps(_mm_mul_ps(q_w, uv_y), uuv_y);
126 __m128 t_z = _mm_add_ps(_mm_mul_ps(q_w, uv_z), uuv_z);
127 out_x = _mm_add_ps(v_x, _mm_add_ps(t_x, t_x));
128 out_y = _mm_add_ps(v_y, _mm_add_ps(t_y, t_y));
129 out_z = _mm_add_ps(v_z, _mm_add_ps(t_z, t_z));
135void EchoSounder::SampleVolumeTask2::runSSE4_1(TileValue * dstValues,
136 uint16_t * timeValues,
137 const EchoSounder::PingInfo* pinf,
138 const uint32_t upperFansToRemove,
139 const uint32_t numPings,
140 const uint32_t numSamples,
141 const uint32_t gridSizeX,
142 const uint32_t gridSizeY,
143 const float depthOffset,
144 const float depthStep,
145 const float sampleSpacing,
147 const std::vector<float>& beamAngleAlongship,
148 const std::vector<float>& beamAngleAthwartship,
149 const glm::ivec3& this_minIndex,
150 const glm::ivec3& this_maxIndex)
152 const auto time = pinf->time;
153 const auto weight = 1.f;
154 const auto numBeamsMajor =
static_cast<uint32_t
>(beamAngleAlongship.size());
155 const auto numBeamsMinor =
static_cast<uint32_t
>(beamAngleAthwartship.size());
156 const auto s = 1.f / sampleSpacing;
159 const vec3 minPolar(beamAngleAthwartship.front(),
160 beamAngleAlongship.front(),
162 const vec3 maxPolar(beamAngleAthwartship.back(),
163 beamAngleAlongship.back(),
164 depthOffset + (numSamples - 1)*depthStep);
167 const uvec3 maxTau(max(2u, numBeamsMinor) - 2,
168 max(2u + upperFansToRemove, numBeamsMajor) - 2 - upperFansToRemove,
169 max(2u, numSamples) - 2);
172 const vec3 polarToIndex = vec3(max(2u, numBeamsMinor) - 2,
173 max(2u, numBeamsMajor) - 2,
174 max(2u, numSamples) - 2) / (maxPolar - minPolar);
176 const auto * minorMu = beamAngleAthwartship.data();
177 const auto * majorMu = beamAngleAlongship.data();
179 for (uint32_t p = 0; p < numPings; p++) {
181 const auto minIndex = this_minIndex;
182 const auto maxIndex = this_maxIndex;
183 const auto * srcValues = pinf[p].field;
185 const auto rot = inverse(pinf[p].metaPing.arrayOrientationGlobal);
186 const __m128 rot_ = _mm_set_ps(rot.w, rot.z, rot.y, rot.x);
187 const auto shift = pinf[p].arrayPositionTile;
189 const __m128 n_x = _mm_set_ps(
190 pinf[p].boundingFrustumNormals[3].x,
191 pinf[p].boundingFrustumNormals[2].x,
192 pinf[p].boundingFrustumNormals[1].x,
193 pinf[p].boundingFrustumNormals[0].x
195 for (
int k = minIndex.z; k < maxIndex.z; k++) {
197 float samplePosGrid_z = sampleSpacing*k;
198 if (samplePosGrid_z < pinf[p].depthRestriction)
continue;
200 for (
int j = minIndex.y; j < maxIndex.y; j++) {
201 float samplePosGrid_y = sampleSpacing*j;
203 const auto l_z = sampleSpacing*k - shift.z;
204 const auto l_y = sampleSpacing*j - shift.y;
205 const __m128 eq_a = _mm_set_ps(
206 pinf[p].boundingFrustumNormals[3].z * l_z + pinf[p].boundingFrustumNormals[3].y*l_y,
207 pinf[p].boundingFrustumNormals[2].z * l_z + pinf[p].boundingFrustumNormals[2].y*l_y,
208 pinf[p].boundingFrustumNormals[1].z * l_z + pinf[p].boundingFrustumNormals[1].y*l_y,
209 pinf[p].boundingFrustumNormals[0].z * l_z + pinf[p].boundingFrustumNormals[0].y*l_y
211 const auto l_zy_2 = l_z*l_z + l_y*l_y;
213 for (
int i = minIndex.x; i < maxIndex.x; i+=4) {
215 __m128 ii = _mm_set_ps(
static_cast<float>(i + 3),
216 static_cast<float>(i + 2),
217 static_cast<float>(i + 1),
218 static_cast<float>(i + 0));
219 __m128 mask = _mm_cmple_ps(ii, _mm_set1_ps(
static_cast<float>(maxIndex.x)));
220 __m128 l_x = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(sampleSpacing), ii), _mm_set1_ps(shift.x));
221 __m128 r2_ = _mm_add_ps(_mm_set1_ps(l_zy_2), _mm_mul_ps(l_x, l_x));
224 __m128 mask_0 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(broadcast_ps<0>(eq_a), _mm_mul_ps(broadcast_ps<0>(n_x), l_x)));
225 __m128 mask_1 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(broadcast_ps<1>(eq_a), _mm_mul_ps(broadcast_ps<1>(n_x), l_x)));
226 __m128 mask_2 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(broadcast_ps<2>(eq_a), _mm_mul_ps(broadcast_ps<2>(n_x), l_x)));
227 __m128 mask_3 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(broadcast_ps<3>(eq_a), _mm_mul_ps(broadcast_ps<3>(n_x), l_x)));
228 __m128 mask_4 = _mm_cmple_ps(_mm_set1_ps(pinf[p].minDepthSquared), r2_);
229 __m128 mask_5 = _mm_cmple_ps(r2_, _mm_set1_ps(pinf[p].maxDepthSquared));
230 mask = _mm_and_ps(_mm_and_ps(_mm_and_ps(mask_0, mask_1),
231 _mm_and_ps(mask_2, mask_3)),
232 _mm_and_ps(_mm_and_ps(mask_4, mask_5),
234 if (_mm_movemask_ps(mask) == 0) {
238 __m128 cartesian_x, cartesian_y, cartesian_z;
239 quat_times_vec3_ps(cartesian_x, cartesian_y, cartesian_z,
241 l_x, _mm_set1_ps(l_y), _mm_set1_ps(l_z));
242 __m128 rep_r = _mm_rsqrt_ps(r2_);
243 __m128 r_ = _mm_rcp_ps(rep_r);
246 float samplePosGrid_x = sampleSpacing*i;
247 const auto samplePosArray = (glm::vec3(samplePosGrid_x, samplePosGrid_y, samplePosGrid_z) - shift);
248 const auto r2 = glm::dot(samplePosArray, samplePosArray);
249 const auto cartesianPos = rot * samplePosArray;
250 float r = std::sqrt(r2);
252 __m128 dirX = asin_ps(_mm_mul_ps(cartesian_x, rep_r));
253 __m128 dirY = asin_ps(_mm_mul_ps(cartesian_y, rep_r));
256 __m128 xi_x = _mm_mul_ps(_mm_set1_ps(polarToIndex.x),
257 _mm_sub_ps(dirY, _mm_set1_ps(minPolar.x)));
258 __m128 xi_y = _mm_mul_ps(_mm_set1_ps(polarToIndex.y),
259 _mm_sub_ps(dirX, _mm_set1_ps(minPolar.y)));
260 __m128 xi_z = _mm_mul_ps(_mm_set1_ps(polarToIndex.z),
261 _mm_sub_ps(r_, _mm_set1_ps(minPolar.z)));
263 __m128 tau_x = _mm_floor_ps(xi_x);
264 __m128 tau_y = _mm_floor_ps(xi_y);
265 __m128 tau_z = _mm_floor_ps(xi_z);
267 __m128i tau_x_vui = _mm_cvtps_epi32(tau_x);
268 __m128i tau_y_vui = _mm_cvtps_epi32(tau_y);
269 __m128i tau_z_vui = _mm_cvtps_epi32(tau_z);
270 __m128 bx_ = _mm_sub_ps(xi_x, tau_x);
271 __m128 by_ = _mm_sub_ps(xi_y, tau_y);
273 auto upper_x = _mm_set1_epi32(maxTau.x);
274 auto inrange_x = _mm_cmpeq_epi32(_mm_max_epu32(tau_x_vui, upper_x), upper_x);
276 auto upper_y = _mm_set1_epi32(maxTau.y);
277 auto inrange_y = _mm_cmpeq_epi32(_mm_max_epu32(tau_y_vui, upper_y), upper_y);
279 auto upper_z = _mm_set1_epi32(maxTau.z);
280 auto inrange_z = _mm_cmpeq_epi32(_mm_max_epu32(tau_z_vui, upper_z), upper_z);
282 mask = _mm_and_ps(_mm_and_ps(mask, _mm_castsi128_ps(inrange_x)),
283 _mm_and_ps(_mm_castsi128_ps(inrange_y), _mm_castsi128_ps(inrange_z)));
285 int movemask = _mm_movemask_ps(mask);
286 if (movemask == 0)
continue;
288 for (uint32_t lane = 0; lane < 4; lane++) {
289 if (((movemask >> lane) & 1) == 0)
continue;
291 uint32_t ix_x = tau_x_vui.m128i_u32[lane];
292 uint32_t ix_y = tau_y_vui.m128i_u32[lane];
293 uint32_t ix_z = tau_z_vui.m128i_u32[lane];
296 float bx = bx_.m128_f32[lane];
297 float by = by_.m128_f32[lane];
299 const auto val00 = srcValues[((ix_y + 0)*numBeamsMinor + (ix_x + 0))*numSamples + ix_z];
300 const auto val01 = srcValues[((ix_y + 1)*numBeamsMinor + (ix_x + 0))*numSamples + ix_z];
301 const auto val0 = (1.f - by)*PingValue_Decode(val00) + by*PingValue_Decode(val01);
303 const auto val10 = srcValues[((ix_y + 0)*numBeamsMinor + (ix_x + 1))*numSamples + ix_z];
304 const auto val11 = srcValues[((ix_y + 1)*numBeamsMinor + (ix_x + 1))*numSamples + ix_z];
305 const auto val1 = (1.f - by)*PingValue_Decode(val10) + by*PingValue_Decode(val11);
308 const auto val = (1.f - bx)*val0 + bx*val1;
310 auto dstOffset = (k*gridSizeY + j)*gridSizeX + i + lane;
313 auto oldValue = TileValue_Decode(dstValues[dstOffset]);
314 if (oldValue == 0.0) oldValue = val;
316 dstValues[dstOffset] = TileValue_Encode(0.5f*oldValue + 0.5f*val);
318 timeValues[dstOffset] = time;
Contains the Engine, Renderer, resource managers and other systems needed to run Cogs....