2#ifdef COGS_EXTENSIONS_AVX
4#include "SampleVolumeTask.h"
19 static const float piTwo = 1.5707963267948966f;
20 static const float pi = 3.1415926535897931f;
21 static const float signBit = -0.f;
22 static const float one = 1.f;
24 static const float asin_deg3_C3 = -0.0187293f;
25 static const float asin_deg3_C2 = 0.0742610f;
26 static const float asin_deg3_C1 = -0.2121144f;
27 static const float asin_deg3_C0 = 1.5707288f;
28 static const __m128 asin_deg3_C = _mm_set_ps(asin_deg3_C3, asin_deg3_C2, asin_deg3_C1, asin_deg3_C0);
30 static const float asin_deg7_C7 = -0.0012624911f;
31 static const float asin_deg7_C6 = 0.0066700901f;
32 static const float asin_deg7_C5 = -0.0170881256f;
33 static const float asin_deg7_C4 = 0.0308918810f;
34 static const float asin_deg7_C3 = -0.0501743046f;
35 static const float asin_deg7_C2 = 0.0889789874f;
36 static const float asin_deg7_C1 = -0.2145988016f;
37 static const float asin_deg7_C0 = 1.5707963050f;
39 __m256 asin_ps(__m256 x)
41 __m256 sign = _mm256_set1_ps(signBit);
42 __m256 abs_x = _mm256_andnot_ps(sign, x);
46 __m256 C = _mm256_broadcast_ps(&asin_deg3_C);
47 __m256 r = _mm256_mul_ps(_mm256_permute_ps(C, 0XFF), abs_x);
48 r = _mm256_add_ps(r, _mm256_mul_ps(_mm256_permute_ps(C, 0XAA), abs_x));
49 r = _mm256_add_ps(r, _mm256_mul_ps(_mm256_permute_ps(C, 0X55), abs_x));
50 r = _mm256_add_ps(r, _mm256_permute_ps(C, 0X00));
53 __m128 r = _mm_mul_ps(_mm_load1_ps(&asin_deg7_C7), abs_x);
54 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C6), abs_x));
55 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C5), abs_x));
56 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C4), abs_x));
57 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C3), abs_x));
58 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C2), abs_x));
59 r = _mm_add_ps(r, _mm_mul_ps(_mm_load1_ps(&asin_deg7_C1), abs_x));
60 r = _mm_add_ps(r, _mm_load1_ps(&asin_deg7_C0));
63 __m256 q = _mm256_sub_ps(_mm256_set1_ps(one), abs_x);
65 q = _mm256_sqrt_ps(q);
67 q = _mm256_rcp_ps(_mm256_rsqrt_ps(q));
70 r = _mm256_sub_ps(_mm256_set1_ps(piTwo), _mm256_mul_ps(q, r));
73 return _mm256_or_ps(r, _mm256_and_ps(x, sign));
76 void quat_times_vec3_ps(__m256& out_x, __m256& out_y, __m256& out_z,
78 const __m256& v_x,
const __m256& v_y,
const __m256& v_z)
80 __m256 q2 = _mm256_broadcast_ps(q);
81 __m256 q_x = _mm256_permute_ps(q2, 0x00);
82 __m256 q_y = _mm256_permute_ps(q2, 0x55);
83 __m256 q_z = _mm256_permute_ps(q2, 0xAA);
84 __m256 q_w = _mm256_permute_ps(q2, 0xFF);
86 __m256 uv_x = _mm256_sub_ps(_mm256_mul_ps(q_y, v_z), _mm256_mul_ps(v_y, q_z));
87 __m256 uv_y = _mm256_sub_ps(_mm256_mul_ps(q_z, v_x), _mm256_mul_ps(v_z, q_x));
88 __m256 uv_z = _mm256_sub_ps(_mm256_mul_ps(q_x, v_y), _mm256_mul_ps(v_x, q_y));
89 __m256 uuv_x = _mm256_sub_ps(_mm256_mul_ps(q_y, uv_z), _mm256_mul_ps(uv_y, q_z));
90 __m256 uuv_y = _mm256_sub_ps(_mm256_mul_ps(q_z, uv_x), _mm256_mul_ps(uv_z, q_x));
91 __m256 uuv_z = _mm256_sub_ps(_mm256_mul_ps(q_x, uv_y), _mm256_mul_ps(uv_x, q_y));
92 __m256 t_x = _mm256_add_ps(_mm256_mul_ps(q_w, uv_x), uuv_x);
93 __m256 t_y = _mm256_add_ps(_mm256_mul_ps(q_w, uv_y), uuv_y);
94 __m256 t_z = _mm256_add_ps(_mm256_mul_ps(q_w, uv_z), uuv_z);
95 out_x = _mm256_add_ps(v_x, _mm256_add_ps(t_x, t_x));
96 out_y = _mm256_add_ps(v_y, _mm256_add_ps(t_y, t_y));
97 out_z = _mm256_add_ps(v_z, _mm256_add_ps(t_z, t_z));
103void EchoSounder::SampleVolumeTask2::runAVX2(TileValue * dstValues,
104 uint16_t * timeValues,
105 const EchoSounder::PingInfo* pinf,
106 const uint32_t upperFansToRemove,
107 const uint32_t numPings,
108 const uint32_t numSamples,
109 const uint32_t gridSizeX,
110 const uint32_t gridSizeY,
111 const float depthOffset,
112 const float depthStep,
113 const float sampleSpacing,
115 const std::vector<float>& beamAngleAlongship,
116 const std::vector<float>& beamAngleAthwartship,
117 const glm::ivec3& this_minIndex,
118 const glm::ivec3& this_maxIndex)
120 const auto time = pinf->time;
121 const auto weight = 1.f;
122 const auto numBeamsMajor =
static_cast<uint32_t
>(beamAngleAlongship.size());
123 const auto numBeamsMinor =
static_cast<uint32_t
>(beamAngleAthwartship.size());
124 const auto s = 1.f / sampleSpacing;
127 const vec3 minPolar(beamAngleAthwartship.front(),
128 beamAngleAlongship.front(),
130 const vec3 maxPolar(beamAngleAthwartship.back(),
131 beamAngleAlongship.back(),
132 depthOffset + (numSamples - 1)*depthStep);
135 const uvec3 maxTau(max(2u, numBeamsMinor) - 2,
136 max(2u + upperFansToRemove, numBeamsMajor) - 2 - upperFansToRemove,
137 max(2u, numSamples) - 2);
140 const vec3 polarToIndex = vec3(max(2u, numBeamsMinor) - 2,
141 max(2u, numBeamsMajor) - 2,
142 max(2u, numSamples) - 2) / (maxPolar - minPolar);
144 const auto * minorMu = beamAngleAthwartship.data();
145 const auto * majorMu = beamAngleAlongship.data();
147 for (uint32_t p = 0; p < numPings; p++) {
149 const auto minIndex = this_minIndex;
150 const auto maxIndex = this_maxIndex;
151 const auto * srcValues = pinf[p].field;
153 const auto rot = inverse(pinf[p].metaPing.arrayOrientationGlobal);
154 const __m128 rot_ = _mm_set_ps(rot.w, rot.z, rot.y, rot.x);
155 const auto shift = pinf[p].arrayPositionTile;
157 const __m128 n_x = _mm_set_ps(
158 pinf[p].boundingFrustumNormals[3].x,
159 pinf[p].boundingFrustumNormals[2].x,
160 pinf[p].boundingFrustumNormals[1].x,
161 pinf[p].boundingFrustumNormals[0].x
163 for (
int k = minIndex.z; k < maxIndex.z; k++) {
165 float samplePosGrid_z = sampleSpacing*k;
166 if (samplePosGrid_z < pinf[p].depthRestriction)
continue;
168 for (
int j = minIndex.y; j < maxIndex.y; j++) {
169 float samplePosGrid_y = sampleSpacing*j;
171 const auto l_z = sampleSpacing*k - shift.z;
172 const auto l_y = sampleSpacing*j - shift.y;
173 const __m128 eq_a = _mm_set_ps(
174 pinf[p].boundingFrustumNormals[3].z * l_z + pinf[p].boundingFrustumNormals[3].y*l_y,
175 pinf[p].boundingFrustumNormals[2].z * l_z + pinf[p].boundingFrustumNormals[2].y*l_y,
176 pinf[p].boundingFrustumNormals[1].z * l_z + pinf[p].boundingFrustumNormals[1].y*l_y,
177 pinf[p].boundingFrustumNormals[0].z * l_z + pinf[p].boundingFrustumNormals[0].y*l_y
179 const auto l_zy_2 = l_z*l_z + l_y*l_y;
181 for (
int i = minIndex.x; i < maxIndex.x; i += 8) {
182 __m256 ii = _mm256_set_ps(
static_cast<float>(i + 7),
183 static_cast<float>(i + 6),
184 static_cast<float>(i + 5),
185 static_cast<float>(i + 4),
186 static_cast<float>(i + 3),
187 static_cast<float>(i + 2),
188 static_cast<float>(i + 1),
189 static_cast<float>(i + 0));
191 __m256 mask = _mm256_cmp_ps(ii, _mm256_set1_ps(
static_cast<float>(maxIndex.x)), _CMP_LE_OQ);
192 __m256 l_x = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(sampleSpacing), ii), _mm256_set1_ps(shift.x));
193 __m256 r2_ = _mm256_add_ps(_mm256_set1_ps(l_zy_2), _mm256_mul_ps(l_x, l_x));
196 __m256 eq_a2 = _mm256_broadcast_ps(&eq_a);
197 __m256 n_x2 = _mm256_broadcast_ps(&n_x);
199 __m256 mask_0 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_permute_ps(eq_a2, 0x00), _mm256_mul_ps(_mm256_permute_ps(n_x2, 0x00), l_x)), _CMP_LE_OQ);
200 __m256 mask_1 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_permute_ps(eq_a2, 0x55), _mm256_mul_ps(_mm256_permute_ps(n_x2, 0x55), l_x)), _CMP_LE_OQ);
201 __m256 mask_2 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_permute_ps(eq_a2, 0xAA), _mm256_mul_ps(_mm256_permute_ps(n_x2, 0xAA), l_x)), _CMP_LE_OQ);
202 __m256 mask_3 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_add_ps(_mm256_permute_ps(eq_a2, 0xFF), _mm256_mul_ps(_mm256_permute_ps(n_x2, 0xFF), l_x)), _CMP_LE_OQ);
203 __m256 mask_4 = _mm256_cmp_ps(_mm256_set1_ps(pinf[p].minDepthSquared), r2_, _CMP_LE_OQ);
204 __m256 mask_5 = _mm256_cmp_ps(r2_, _mm256_set1_ps(pinf[p].maxDepthSquared), _CMP_LE_OQ);
205 mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask_0, mask_1),
206 _mm256_and_ps(mask_2, mask_3)),
207 _mm256_and_ps(_mm256_and_ps(mask_4, mask_5),
209 if (_mm256_movemask_ps(mask) == 0) {
213 __m256 cartesian_x, cartesian_y, cartesian_z;
214 quat_times_vec3_ps(cartesian_x, cartesian_y, cartesian_z,
216 l_x, _mm256_set1_ps(l_y), _mm256_set1_ps(l_z));
217 __m256 rep_r = _mm256_rsqrt_ps(r2_);
218 __m256 r_ = _mm256_rcp_ps(rep_r);
220 __m256 dirX = asin_ps(_mm256_mul_ps(cartesian_x, rep_r));
221 __m256 dirY = asin_ps(_mm256_mul_ps(cartesian_y, rep_r));
225 __m256 xi_x = _mm256_mul_ps(_mm256_set1_ps(polarToIndex.x),
226 _mm256_sub_ps(dirY, _mm256_set1_ps(minPolar.x)));
227 __m256 xi_y = _mm256_mul_ps(_mm256_set1_ps(polarToIndex.y),
228 _mm256_sub_ps(dirX, _mm256_set1_ps(minPolar.y)));
229 __m256 xi_z = _mm256_mul_ps(_mm256_set1_ps(polarToIndex.z),
230 _mm256_sub_ps(r_, _mm256_set1_ps(minPolar.z)));
232 __m256 tau_x = _mm256_floor_ps(xi_x);
233 __m256 tau_y = _mm256_floor_ps(xi_y);
234 __m256 tau_z = _mm256_floor_ps(xi_z);
237 __m256i tau_x_vui = _mm256_cvtps_epi32(tau_x);
238 __m256i tau_y_vui = _mm256_cvtps_epi32(tau_y);
239 __m256i tau_z_vui = _mm256_cvtps_epi32(tau_z);
240 __m256 bx_ = _mm256_sub_ps(xi_x, tau_x);
241 __m256 by_ = _mm256_sub_ps(xi_y, tau_y);
245 __m256i upper_x = _mm256_set1_epi32(maxTau.x);
246 __m256i inrange_x = _mm256_cmpeq_epi32(_mm256_max_epu32(tau_x_vui, upper_x), upper_x);
248 __m256i upper_y = _mm256_set1_epi32(maxTau.y);
249 __m256i inrange_y = _mm256_cmpeq_epi32(_mm256_max_epu32(tau_y_vui, upper_y), upper_y);
251 __m256i upper_z = _mm256_set1_epi32(maxTau.z);
252 __m256i inrange_z = _mm256_cmpeq_epi32(_mm256_max_epu32(tau_z_vui, upper_z), upper_z);
254 mask = _mm256_and_ps(_mm256_and_ps(mask, _mm256_castsi256_ps(inrange_x)),
255 _mm256_and_ps(_mm256_castsi256_ps(inrange_y), _mm256_castsi256_ps(inrange_z)));
257 int movemask = _mm256_movemask_ps(mask);
258 if (movemask == 0)
continue;
260 for (uint32_t lane = 0; lane < 8; lane++) {
261 if (((movemask >> lane) & 1) == 0)
continue;
263 uint32_t ix_x = tau_x_vui.m256i_u32[lane];
264 uint32_t ix_y = tau_y_vui.m256i_u32[lane];
265 uint32_t ix_z = tau_z_vui.m256i_u32[lane];
268 float bx = bx_.m256_f32[lane];
269 float by = by_.m256_f32[lane];
271 const auto val00 = srcValues[((ix_y + 0)*numBeamsMinor + (ix_x + 0))*numSamples + ix_z];
272 const auto val01 = srcValues[((ix_y + 1)*numBeamsMinor + (ix_x + 0))*numSamples + ix_z];
273 const auto val0 = (1.f - by)*PingValue_Decode(val00) + by*PingValue_Decode(val01);
275 const auto val10 = srcValues[((ix_y + 0)*numBeamsMinor + (ix_x + 1))*numSamples + ix_z];
276 const auto val11 = srcValues[((ix_y + 1)*numBeamsMinor + (ix_x + 1))*numSamples + ix_z];
277 const auto val1 = (1.f - by)*PingValue_Decode(val10) + by*PingValue_Decode(val11);
280 const auto val = (1.f - bx)*val0 + bx*val1;
282 auto dstOffset = (k*gridSizeY + j)*gridSizeX + i + lane;
285 auto oldValue = TileValue_Decode(dstValues[dstOffset]);
288 auto age =
static_cast<float>(time - timeValues[dstOffset]);
289 auto w = std::pow(decay, age);
291 dstValues[dstOffset] = TileValue_Encode(w*oldValue + (1.f-decay)*val);
293 timeValues[dstOffset] = time;
Contains the Engine, Renderer, resource managers and other systems needed to run Cogs....