Cogs.Core
SampleVolumeTask_AVX.cpp
1#if 0
2#ifdef COGS_EXTENSIONS_AVX
3#include <intrin.h>
4#include "SampleVolumeTask.h"
5
6using namespace Cogs::Core;
7
8using glm::uvec3;
9using glm::ivec3;
10using glm::vec3;
11using glm::clamp;
12using glm::max;
13using glm::min;
14using glm::floor;
15using glm::ceil;
16using glm::inverse;
17
18namespace {
19 static const float piTwo = 1.5707963267948966f;
20 static const float pi = 3.1415926535897931f;
21 static const float signBit = -0.f;
22 static const float one = 1.f;
23
24 static const float asin_deg3_C3 = -0.0187293f;
25 static const float asin_deg3_C2 = 0.0742610f;
26 static const float asin_deg3_C1 = -0.2121144f;
27 static const float asin_deg3_C0 = 1.5707288f;
28 static const __m128 asin_deg3_C = _mm_set_ps(asin_deg3_C3, asin_deg3_C2, asin_deg3_C1, asin_deg3_C0);
29
30 static const float asin_deg7_C7 = -0.0012624911f;
31 static const float asin_deg7_C6 = 0.0066700901f;
32 static const float asin_deg7_C5 = -0.0170881256f;
33 static const float asin_deg7_C4 = 0.0308918810f;
34 static const float asin_deg7_C3 = -0.0501743046f;
35 static const float asin_deg7_C2 = 0.0889789874f;
36 static const float asin_deg7_C1 = -0.2145988016f;
37 static const float asin_deg7_C0 = 1.5707963050f;
38
39 __m256 asin_ps(__m256 x)
40 {
41 __m256 sign = _mm256_set1_ps(signBit);
42 __m256 abs_x = _mm256_andnot_ps(sign, x);
43
44#if 1
45 // Max error < 2e-5
46 __m256 C = _mm256_broadcast_ps(&asin_deg3_C);
47 __m256 r = _mm256_mul_ps(_mm256_permute_ps(C, 0XFF), abs_x);
48 r = _mm256_add_ps(r, _mm256_mul_ps(_mm256_permute_ps(C, 0XAA), abs_x));
49 r = _mm256_add_ps(r, _mm256_mul_ps(_mm256_permute_ps(C, 0X55), abs_x));
50 r = _mm256_add_ps(r, _mm256_permute_ps(C, 0X00));
51#elif 1
52 // Max error < 2e-8
53 __m128 r = _mm_mul_ps(_mm_load1_ps(&asin_deg7_C7), abs_x);
54 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C6), abs_x));
55 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C5), abs_x));
56 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C4), abs_x));
57 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C3), abs_x));
58 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C2), abs_x));
59 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C1), abs_x));
60 r = _mm_add_ps(r, _mm_load1_ps(&asin_deg7_C0));
61#endif
62
63 __m256 q = _mm256_sub_ps(_mm256_set1_ps(one), abs_x);
64#if 0
65 q = _mm256_sqrt_ps(q);
66#else
67 q = _mm256_rcp_ps(_mm256_rsqrt_ps(q));
68#endif
69
70 r = _mm256_sub_ps(_mm256_set1_ps(piTwo), _mm256_mul_ps(q, r));
71
72 // copy sign from x
73 return _mm256_or_ps(r, _mm256_and_ps(x, sign));
74 }
75
76 void quat_times_vec3_ps(__m256& out_x, __m256& out_y, __m256& out_z,
77 const __m128* q,
78 const __m256& v_x, const __m256& v_y, const __m256& v_z)
79 {
80 __m256 q2 = _mm256_broadcast_ps(q);
81 __m256 q_x = _mm256_permute_ps(q2, 0x00);
82 __m256 q_y = _mm256_permute_ps(q2, 0x55);
83 __m256 q_z = _mm256_permute_ps(q2, 0xAA);
84 __m256 q_w = _mm256_permute_ps(q2, 0xFF);
85
86 __m256 uv_x = _mm256_sub_ps(_mm256_mul_ps(q_y, v_z), _mm256_mul_ps(v_y, q_z));
87 __m256 uv_y = _mm256_sub_ps(_mm256_mul_ps(q_z, v_x), _mm256_mul_ps(v_z, q_x));
88 __m256 uv_z = _mm256_sub_ps(_mm256_mul_ps(q_x, v_y), _mm256_mul_ps(v_x, q_y));
89 __m256 uuv_x = _mm256_sub_ps(_mm256_mul_ps(q_y, uv_z), _mm256_mul_ps(uv_y, q_z));
90 __m256 uuv_y = _mm256_sub_ps(_mm256_mul_ps(q_z, uv_x), _mm256_mul_ps(uv_z, q_x));
91 __m256 uuv_z = _mm256_sub_ps(_mm256_mul_ps(q_x, uv_y), _mm256_mul_ps(uv_x, q_y));
92 __m256 t_x = _mm256_add_ps(_mm256_mul_ps(q_w, uv_x), uuv_x);
93 __m256 t_y = _mm256_add_ps(_mm256_mul_ps(q_w, uv_y), uuv_y);
94 __m256 t_z = _mm256_add_ps(_mm256_mul_ps(q_w, uv_z), uuv_z);
95 out_x = _mm256_add_ps(v_x, _mm256_add_ps(t_x, t_x));
96 out_y = _mm256_add_ps(v_y, _mm256_add_ps(t_y, t_y));
97 out_z = _mm256_add_ps(v_z, _mm256_add_ps(t_z, t_z));
98 }
99
100}
101
102
103void EchoSounder::SampleVolumeTask2::runAVX2(TileValue * dstValues,
104 uint16_t * timeValues,
105 const EchoSounder::PingInfo* pinf,
106 const uint32_t upperFansToRemove,
107 const uint32_t numPings,
108 const uint32_t numSamples,
109 const uint32_t gridSizeX,
110 const uint32_t gridSizeY,
111 const float depthOffset,
112 const float depthStep,
113 const float sampleSpacing,
114 const float decay,
115 const std::vector<float>& beamAngleAlongship,
116 const std::vector<float>& beamAngleAthwartship,
117 const glm::ivec3& this_minIndex,
118 const glm::ivec3& this_maxIndex)
119{
120 const auto time = pinf->time;
121 const auto weight = 1.f;// / data.numPings;
122 const auto numBeamsMajor = static_cast<uint32_t>(beamAngleAlongship.size());
123 const auto numBeamsMinor = static_cast<uint32_t>(beamAngleAthwartship.size());
124 const auto s = 1.f / sampleSpacing;
125 //const auto width = data.cellWidth;
126
127 const vec3 minPolar(beamAngleAthwartship.front(),
128 beamAngleAlongship.front(),
129 depthOffset);
130 const vec3 maxPolar(beamAngleAthwartship.back(),
131 beamAngleAlongship.back(),
132 depthOffset + (numSamples - 1)*depthStep);
133
134
135 const uvec3 maxTau(max(2u, numBeamsMinor) - 2,
136 max(2u + upperFansToRemove, numBeamsMajor) - 2 - upperFansToRemove,
137 max(2u, numSamples) - 2);
138
139 // map minPolar to 0 and maxPolar to N-1.
140 const vec3 polarToIndex = vec3(max(2u, numBeamsMinor) - 2,
141 max(2u, numBeamsMajor) - 2,
142 max(2u, numSamples) - 2) / (maxPolar - minPolar);
143
144 const auto * minorMu = beamAngleAthwartship.data();
145 const auto * majorMu = beamAngleAlongship.data();
146
147 for (uint32_t p = 0; p < numPings; p++) {
148
149 const auto minIndex = this_minIndex;// max(this_minIndex, ivec3(floor(s * pinf[p].boundingBoxTile[0])));
150 const auto maxIndex = this_maxIndex;// min(this_maxIndex, ivec3(ceil(s * pinf[p].boundingBoxTile[1])));
151 const auto * srcValues = pinf[p].field;
152
153 const auto rot = inverse(pinf[p].metaPing.arrayOrientationGlobal);
154 const __m128 rot_ = _mm_set_ps(rot.w, rot.z, rot.y, rot.x);
155 const auto shift = pinf[p].arrayPositionTile;
156
157 const __m128 n_x = _mm_set_ps(
158 pinf[p].boundingFrustumNormals[3].x,
159 pinf[p].boundingFrustumNormals[2].x,
160 pinf[p].boundingFrustumNormals[1].x,
161 pinf[p].boundingFrustumNormals[0].x
162 );
163 for (int k = minIndex.z; k < maxIndex.z; k++) {
164
165 float samplePosGrid_z = sampleSpacing*k;
166 if (samplePosGrid_z < pinf[p].depthRestriction) continue;
167
168 for (int j = minIndex.y; j < maxIndex.y; j++) {
169 float samplePosGrid_y = sampleSpacing*j;
170
171 const auto l_z = sampleSpacing*k - shift.z;
172 const auto l_y = sampleSpacing*j - shift.y;
173 const __m128 eq_a = _mm_set_ps(
174 pinf[p].boundingFrustumNormals[3].z * l_z + pinf[p].boundingFrustumNormals[3].y*l_y,
175 pinf[p].boundingFrustumNormals[2].z * l_z + pinf[p].boundingFrustumNormals[2].y*l_y,
176 pinf[p].boundingFrustumNormals[1].z * l_z + pinf[p].boundingFrustumNormals[1].y*l_y,
177 pinf[p].boundingFrustumNormals[0].z * l_z + pinf[p].boundingFrustumNormals[0].y*l_y
178 );
179 const auto l_zy_2 = l_z*l_z + l_y*l_y;
180
181 for (int i = minIndex.x; i < maxIndex.x; i += 8) {
182 __m256 ii = _mm256_set_ps(static_cast<float>(i + 7),
183 static_cast<float>(i + 6),
184 static_cast<float>(i + 5),
185 static_cast<float>(i + 4),
186 static_cast<float>(i + 3),
187 static_cast<float>(i + 2),
188 static_cast<float>(i + 1),
189 static_cast<float>(i + 0));
190
191 __m256 mask = _mm256_cmp_ps(ii, _mm256_set1_ps(static_cast<float>(maxIndex.x)), _CMP_LE_OQ);
192 __m256 l_x = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(sampleSpacing), ii), _mm256_set1_ps(shift.x));
193 __m256 r2_ = _mm256_add_ps(_mm256_set1_ps(l_zy_2), _mm256_mul_ps(l_x, l_x));
194
195 // Early exit: check against frustum planes
196 __m256 eq_a2 = _mm256_broadcast_ps(&eq_a); // double copies of eq_a
197 __m256 n_x2 = _mm256_broadcast_ps(&n_x); // double copies of n_x
198
199 __m256 mask_0 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_permute_ps(eq_a2, 0x00), _mm256_mul_ps(_mm256_permute_ps(n_x2, 0x00), l_x)), _CMP_LE_OQ);
200 __m256 mask_1 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_permute_ps(eq_a2, 0x55), _mm256_mul_ps(_mm256_permute_ps(n_x2, 0x55), l_x)), _CMP_LE_OQ);
201 __m256 mask_2 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_permute_ps(eq_a2, 0xAA), _mm256_mul_ps(_mm256_permute_ps(n_x2, 0xAA), l_x)), _CMP_LE_OQ);
202 __m256 mask_3 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_permute_ps(eq_a2, 0xFF), _mm256_mul_ps(_mm256_permute_ps(n_x2, 0xFF), l_x)), _CMP_LE_OQ);
203 __m256 mask_4 = _mm256_cmp_ps(_mm256_set1_ps(pinf[p].minDepthSquared), r2_, _CMP_LE_OQ);
204 __m256 mask_5 = _mm256_cmp_ps(r2_, _mm256_set1_ps(pinf[p].maxDepthSquared), _CMP_LE_OQ);
205 mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask_0, mask_1),
206 _mm256_and_ps(mask_2, mask_3)),
207 _mm256_and_ps(_mm256_and_ps(mask_4, mask_5),
208 mask));
209 if (_mm256_movemask_ps(mask) == 0) {
210 continue;
211 }
212
213 __m256 cartesian_x, cartesian_y, cartesian_z;
214 quat_times_vec3_ps(cartesian_x, cartesian_y, cartesian_z,
215 &rot_,
216 l_x, _mm256_set1_ps(l_y), _mm256_set1_ps(l_z));
217 __m256 rep_r = _mm256_rsqrt_ps(r2_);
218 __m256 r_ = _mm256_rcp_ps(rep_r);
219
220 __m256 dirX = asin_ps(_mm256_mul_ps(cartesian_x, rep_r)); //std::asin(cartesianPos.x / r);
221 __m256 dirY = asin_ps(_mm256_mul_ps(cartesian_y, rep_r)); //std::asin(cartesianPos.y / r);
222
223
224 // calculate xi (source indices with fractional part).
225 __m256 xi_x = _mm256_mul_ps(_mm256_set1_ps(polarToIndex.x),
226 _mm256_sub_ps(dirY, _mm256_set1_ps(minPolar.x)));
227 __m256 xi_y = _mm256_mul_ps(_mm256_set1_ps(polarToIndex.y),
228 _mm256_sub_ps(dirX, _mm256_set1_ps(minPolar.y)));
229 __m256 xi_z = _mm256_mul_ps(_mm256_set1_ps(polarToIndex.z),
230 _mm256_sub_ps(r_, _mm256_set1_ps(minPolar.z)));
231
232 __m256 tau_x = _mm256_floor_ps(xi_x);
233 __m256 tau_y = _mm256_floor_ps(xi_y);
234 __m256 tau_z = _mm256_floor_ps(xi_z);
235
236
237 __m256i tau_x_vui = _mm256_cvtps_epi32(tau_x);
238 __m256i tau_y_vui = _mm256_cvtps_epi32(tau_y);
239 __m256i tau_z_vui = _mm256_cvtps_epi32(tau_z);
240 __m256 bx_ = _mm256_sub_ps(xi_x, tau_x);
241 __m256 by_ = _mm256_sub_ps(xi_y, tau_y);
242
243
244
245 __m256i upper_x = _mm256_set1_epi32(maxTau.x);
246 __m256i inrange_x = _mm256_cmpeq_epi32(_mm256_max_epu32(tau_x_vui, upper_x), upper_x);
247
248 __m256i upper_y = _mm256_set1_epi32(maxTau.y);
249 __m256i inrange_y = _mm256_cmpeq_epi32(_mm256_max_epu32(tau_y_vui, upper_y), upper_y);
250
251 __m256i upper_z = _mm256_set1_epi32(maxTau.z);
252 __m256i inrange_z = _mm256_cmpeq_epi32(_mm256_max_epu32(tau_z_vui, upper_z), upper_z);
253
254 mask = _mm256_and_ps(_mm256_and_ps(mask, _mm256_castsi256_ps(inrange_x)),
255 _mm256_and_ps(_mm256_castsi256_ps(inrange_y), _mm256_castsi256_ps(inrange_z)));
256
257 int movemask = _mm256_movemask_ps(mask);
258 if (movemask == 0) continue;
259
260 for (uint32_t lane = 0; lane < 8; lane++) {
261 if (((movemask >> lane) & 1) == 0) continue;
262
263 uint32_t ix_x = tau_x_vui.m256i_u32[lane];
264 uint32_t ix_y = tau_y_vui.m256i_u32[lane];
265 uint32_t ix_z = tau_z_vui.m256i_u32[lane];
266
267 // Bi-linear interpolation, nearest nb along depth.
268 float bx = bx_.m256_f32[lane];//xi.x - tau.x;
269 float by = by_.m256_f32[lane];//xi.y - tau.y;
270
271 const auto val00 = srcValues[((ix_y + 0)*numBeamsMinor + (ix_x + 0))*numSamples + ix_z];
272 const auto val01 = srcValues[((ix_y + 1)*numBeamsMinor + (ix_x + 0))*numSamples + ix_z];
273 const auto val0 = (1.f - by)*PingValue_Decode(val00) + by*PingValue_Decode(val01);
274
275 const auto val10 = srcValues[((ix_y + 0)*numBeamsMinor + (ix_x + 1))*numSamples + ix_z];
276 const auto val11 = srcValues[((ix_y + 1)*numBeamsMinor + (ix_x + 1))*numSamples + ix_z];
277 const auto val1 = (1.f - by)*PingValue_Decode(val10) + by*PingValue_Decode(val11);
278
279 //const auto val = 10000000.f;//val11;// (1.f - bx)*val0 + bx*val1;
280 const auto val = (1.f - bx)*val0 + bx*val1;
281
282 auto dstOffset = (k*gridSizeY + j)*gridSizeX + i + lane;
283
284
285 auto oldValue = TileValue_Decode(dstValues[dstOffset]);
286 //if (oldValue == 0.0) oldValue = val;
287
288 auto age = static_cast<float>(time - timeValues[dstOffset]);
289 auto w = std::pow(decay, age);
290
291 dstValues[dstOffset] = TileValue_Encode(w*oldValue + (1.f-decay)*val);
292 //dstValues[dstOffset] = max(oldValue, val);
293 timeValues[dstOffset] = time;
294 }
295 }
296 }
297 }
298 }
299}
300#endif
301#endif
Contains the Engine, Renderer, resource managers and other systems needed to run Cogs....