Cogs.Core
UniformGridSystem_sample_fma_gather_avx2.cpp

#include <glm/glm.hpp>
#include <glm/gtc/quaternion.hpp>

#include <immintrin.h>
#include <cassert>
#include <cstdint>

namespace {

    template<int lane> inline __m128 broadcast_ps(__m128 x) {
        return _mm_shuffle_ps(x, x, _MM_SHUFFLE(lane, lane, lane, lane));
    }

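    // Rotates four vectors (held as separate x/y/z lane registers) by the single quaternion q,
    // using the standard expansion v' = v + 2*(q.w*(u x v) + u x (u x v)) with u = q.xyz
    // (the same form glm::rotate uses), with the cross products folded into fmsub/fmadd.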
    __forceinline void quat_times_fma_vec3_ps(__m128& out_x, __m128& out_y, __m128& out_z,
                                              const __m128& q,
                                              const __m128& v_x, const __m128& v_y, const __m128& v_z)
    {
        __m128 q_x = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0, 0, 0, 0));
        __m128 q_y = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1, 1, 1, 1));
        __m128 q_z = _mm_shuffle_ps(q, q, _MM_SHUFFLE(2, 2, 2, 2));
        __m128 q_w = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
        __m128 uv_x = _mm_fmsub_ps(q_y, v_z, _mm_mul_ps(v_y, q_z));
        __m128 uv_y = _mm_fmsub_ps(q_z, v_x, _mm_mul_ps(v_z, q_x));
        __m128 uv_z = _mm_fmsub_ps(q_x, v_y, _mm_mul_ps(v_x, q_y));
        __m128 uuv_x = _mm_fmsub_ps(q_y, uv_z, _mm_mul_ps(uv_y, q_z));
        __m128 uuv_y = _mm_fmsub_ps(q_z, uv_x, _mm_mul_ps(uv_z, q_x));
        __m128 uuv_z = _mm_fmsub_ps(q_x, uv_y, _mm_mul_ps(uv_x, q_y));
        __m128 t_x = _mm_fmadd_ps(q_w, uv_x, uuv_x);
        __m128 t_y = _mm_fmadd_ps(q_w, uv_y, uuv_y);
        __m128 t_z = _mm_fmadd_ps(q_w, uv_z, uuv_z);
        out_x = _mm_add_ps(v_x, _mm_add_ps(t_x, t_x));
        out_y = _mm_add_ps(v_y, _mm_add_ps(t_y, t_y));
        out_z = _mm_add_ps(v_z, _mm_add_ps(t_z, t_z));
    }

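    // atan(x) ~= (pi/4)*x - x*(|x| - 1)*(0.2447 + 0.0663*|x|) for x in [-1, 1]; a well-known
    // polynomial approximation with an absolute error below roughly 0.0015 rad, which is what
    // the _00155 suffix and the unit-test bound further down refer to.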
    __forceinline __m128 atan_00155_fma_ps(__m128 x)
    {
        static const float signBit = -0.f;
        static const __m128 C = _mm_setr_ps(float(3.14159265358979323846264338327950288 / 4.0),
                                            0.2447f,
                                            0.0663f,
                                            1.f);
        const auto c0 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(0, 0, 0, 0));
        const auto c1 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(1, 1, 1, 1));
        const auto c2 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(2, 2, 2, 2));
        const auto c3 = _mm_shuffle_ps(C, C, _MM_SHUFFLE(3, 3, 3, 3));
        __m128 sign = _mm_load1_ps(&signBit);
        __m128 abs_x = _mm_andnot_ps(sign, x);
        __m128 t1 = _mm_sub_ps(abs_x, c3);          // t1 = |x| - 1
        __m128 t3 = _mm_fmadd_ps(abs_x, c2, c1);    // t3 = abs_x*c2 + c1
        __m128 t2 = _mm_mul_ps(t1, t3);
        __m128 t4 = _mm_mul_ps(x, t2);              // t4 = x*t1*t3
        __m128 t5 = _mm_fmsub_ps(c0, x, t4);        // t5 = c0*x - t4
        return t5;
    }

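    // asin(x) ~= pi/2 - sqrt(1 - |x|)*(c0 + c1*|x| + c2*|x|^2 + c3*|x|^3), with the sign of x
    // reapplied at the end; the coefficients are the classic Abramowitz & Stegun 4.4.45 set.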
    __forceinline /*__declspec(noinline)*/ __m128 asin_fma_ps(__m128 x)
    {
        const __m128 A = _mm_setr_ps(1.5707288f, -0.2121144f, 0.0742610f, -0.0187293f);
        __m128 C0_ps = _mm_shuffle_ps(A, A, _MM_SHUFFLE(0, 0, 0, 0));
        __m128 C1_ps = _mm_shuffle_ps(A, A, _MM_SHUFFLE(1, 1, 1, 1));
        __m128 C2_ps = _mm_shuffle_ps(A, A, _MM_SHUFFLE(2, 2, 2, 2));
        __m128 C3_ps = _mm_shuffle_ps(A, A, _MM_SHUFFLE(3, 3, 3, 3));

        const __m128 B = _mm_setr_ps(-0.f, 1.f, 1.5707963267948966f, 0.f);
        __m128 sign_ps = _mm_shuffle_ps(B, B, _MM_SHUFFLE(0, 0, 0, 0));
        __m128 one_ps = _mm_shuffle_ps(B, B, _MM_SHUFFLE(1, 1, 1, 1));
        __m128 halfpi_ps = _mm_shuffle_ps(B, B, _MM_SHUFFLE(2, 2, 2, 2));

        __m128 abs_x = _mm_andnot_ps(sign_ps, x);

        __m128 a = _mm_sub_ps(one_ps, abs_x);
        a = _mm_sqrt_ps(a);

        __m128 b = C3_ps;
        b = _mm_fmadd_ps(b, abs_x, C2_ps);
        b = _mm_fmadd_ps(b, abs_x, C1_ps);
        b = _mm_fmadd_ps(b, abs_x, C0_ps);

        __m128 rv = _mm_fmsub_ps(a, b, halfpi_ps);
        rv = _mm_andnot_ps(sign_ps, rv);
        rv = _mm_or_ps(rv, _mm_and_ps(x, sign_ps));

        return rv;
    }

#if 0 // The static unit test is no good on systems that do not have AVX2 support, but extensions do not have proper unit test support.
#pragma optimize( "", off )
    static struct UnitTests
    {
        UnitTests()
        {
            const unsigned N = 1000;

            for (unsigned i = 0; i < N; i++) {
                __m128 x, y;
                x.m128_f32[0] = (2.f / (N - 1))*i - 1.f;
                y = atan_00155_fma_ps(x);
                auto e = std::abs(std::atan(x.m128_f32[0]) - y.m128_f32[0]);
                assert(e < 0.00155f);
            }

            for (unsigned i = 0; i < N; i++) {
                __m128 x, y;
                x.m128_f32[0] = (2.f / (N - 1))*i - 1.f;
                y = asin_fma_ps(x);
                auto e = std::abs(std::asin(x.m128_f32[0]) - y.m128_f32[0]);
                assert(e < 7e-4f);
            }

            int a = 2;
        }
    } unitTests;
#pragma optimize( "", on )
#endif

}

namespace Cogs::Core::EchoSounder {

    //#pragma optimize( "", off )
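    // For each cell of the output grid tile: form the sample position relative to the array position,
    // compute a visibility mask against the four frustum planes and the min/max range, rotate the
    // position into the ping's orientation with the inverse orientation quaternion, convert it to
    // direction angles (via the asin/atan approximations above) and range, map those through
    // polarScale/polarShift to fractional indices into the polar sample grid, and bilinearly
    // interpolate the neighbouring samples into data.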
    void sampleTile_inner_fma_gather(float * data,
                                     const float *v,
                                     const glm::vec3 /*tileIndex*/,
                                     const glm::uvec3 /*tilePos*/,
                                     const glm::uvec3 dataSize,
                                     const glm::uvec3 maxIndices,
                                     const glm::vec3 tp,
                                     const glm::vec3 scale,
                                     const glm::vec3 arrayPositionGlobal,
                                     const glm::vec4* frustum,
                                     const float minDistanceSquared,
                                     const float maxDistanceSquared,
                                     const glm::quat inverseOrientation,
                                     const uint32_t coordSys,
                                     const uint32_t minorCount,
                                     const uint32_t sampleCount,
                                     const glm::vec3 polarScale,
                                     const glm::vec3 polarShift)
    {
        const __m128 rot_ = _mm_set_ps(inverseOrientation.w, inverseOrientation.z, inverseOrientation.y, inverseOrientation.x);
        assert((dataSize.x & 3) == 0);
        assert(coordSys == 1);

        static const __m128 c0123 = _mm_setr_ps(0, 1, 2, 3);
        static const __m128 one_ps = _mm_setr_ps(1.f, 1.f, 1.f, 1.f);
        glm::vec3 ban = arrayPositionGlobal - tp;

        glm::uvec3 maxIndicesLL = glm::max(maxIndices, glm::uvec3(1u)) - glm::uvec3(1);

        for (uint32_t z = 0; z < dataSize.z; z++) {
            __m128 pz = _mm_mul_ss(_mm_set1_ps(scale.z), _mm_set1_ps((float)z));
            __m128 qz = _mm_sub_ss(pz, _mm_set1_ps(ban.z));
            for (uint32_t y = 0; y < dataSize.y; y++) {
                __m128 py = _mm_mul_ss(_mm_set1_ps(scale.y), _mm_set1_ps((float)y));
                __m128 qy = _mm_sub_ss(py, _mm_set1_ps(ban.y));
                __m128 r2_yz = _mm_add_ss(_mm_mul_ss(qy, qy), _mm_mul_ss(qz, qz));
                __m128 in0_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[0].z), qz));
                __m128 in1_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[1].z), qz));
                __m128 in2_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[2].z), qz));
                __m128 in3_dot_yz = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[3].z), qz));

                in0_dot_yz = _mm_shuffle_ps(in0_dot_yz, in0_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
                in1_dot_yz = _mm_shuffle_ps(in1_dot_yz, in1_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
                in2_dot_yz = _mm_shuffle_ps(in2_dot_yz, in2_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
                in3_dot_yz = _mm_shuffle_ps(in3_dot_yz, in3_dot_yz, _MM_SHUFFLE(0, 0, 0, 0));
                r2_yz = _mm_shuffle_ps(r2_yz, r2_yz, _MM_SHUFFLE(0, 0, 0, 0));

                for (uint32_t x = 0; x < dataSize.x; x += 4) {

                    // sample position relative to ping origin
                    __m128 i = _mm_add_ps(_mm_set1_ps((float)x), c0123);
                    __m128 qx = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(scale.x), i), _mm_set1_ps(ban.x));

                    // squared radius
                    __m128 r2 = _mm_add_ps(_mm_mul_ps(qx, qx), r2_yz);

                    // compare q against frustum planes
                    __m128 mask0 = _mm_cmple_ps(_mm_setzero_ps(), _mm_fmadd_ps(_mm_set1_ps(frustum[0].x), qx, in0_dot_yz));
                    __m128 mask1 = _mm_cmple_ps(_mm_setzero_ps(), _mm_fmadd_ps(_mm_set1_ps(frustum[1].x), qx, in1_dot_yz));
                    __m128 mask2 = _mm_cmple_ps(_mm_setzero_ps(), _mm_fmadd_ps(_mm_set1_ps(frustum[2].x), qx, in2_dot_yz));
                    __m128 mask3 = _mm_cmple_ps(_mm_setzero_ps(), _mm_fmadd_ps(_mm_set1_ps(frustum[3].x), qx, in3_dot_yz));

                    //__m128 mask0 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].x), qx), in0_dot_yz));
                    //__m128 mask1 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].x), qx), in1_dot_yz));
                    //__m128 mask2 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].x), qx), in2_dot_yz));
                    //__m128 mask3 = _mm_cmple_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].x), qx), in3_dot_yz));
                    __m128 mask4 = _mm_cmple_ps(_mm_set1_ps(minDistanceSquared), r2);
                    __m128 mask5 = _mm_cmple_ps(r2, _mm_set1_ps(maxDistanceSquared));
                    __m128 mask = _mm_and_ps(_mm_and_ps(_mm_and_ps(mask0, mask1),
                                                        _mm_and_ps(mask2, mask3)),
                                             _mm_and_ps(mask4, mask5));
                    //int movemask = _mm_movemask_ps(mask);
                    //if (movemask == 0) {
                    //    assert(false);
                    //    continue;
                    //}
                    // rotate into ping's orientation
                    __m128 ax, ay, az;
                    quat_times_fma_vec3_ps(ax, ay, az,
                                           rot_,
                                           qx, broadcast_ps<0>(qy), broadcast_ps<0>(qz));

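                    // r_inv ~ 1/sqrt(r2) and r = rcp(r_inv) ~ sqrt(r2); rsqrt/rcp are ~12-bit
                    // approximations (max relative error about 1.5*2^-12), traded for speed here.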
                    __m128 r_inv = _mm_rsqrt_ps(r2);
                    __m128 r = _mm_rcp_ps(r_inv);

                    // dirX = asin(a.x/r)
                    // dirY = atan(y/z)
                    __m128 dirx = asin_fma_ps(_mm_mul_ps(ax, r_inv));
                    __m128 diry = atan_00155_fma_ps(_mm_mul_ps(ay, _mm_rcp_ps(az)));

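                    // polarScale/polarShift map the direction angles (diry, dirx) and the range r to
                    // fractional indices into the polar sample grid; floor() picks the base cell
                    // (clamped to maxIndicesLL) and the fractional parts t_i/t_j are used as
                    // interpolation weights further down.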
                    // Figure out interpolation parameters
                    __m128 xi_i = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.x), _mm_sub_ps(diry, _mm_set1_ps(polarShift.x))));
                    __m128 xi_j = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.y), _mm_sub_ps(dirx, _mm_set1_ps(polarShift.y))));
                    __m128 xi_k = _mm_max_ps(_mm_setzero_ps(), _mm_mul_ps(_mm_set1_ps(polarScale.z), _mm_sub_ps(r, _mm_set1_ps(polarShift.z))));
                    __m128 tau_i = _mm_floor_ps(xi_i);
                    __m128 tau_j = _mm_floor_ps(xi_j);
                    __m128 tau_k = _mm_floor_ps(xi_k);
                    __m128 t_i = _mm_sub_ps(xi_i, tau_i);
                    __m128 t_j = _mm_sub_ps(xi_j, tau_j);
                    __m128 t_k = _mm_sub_ps(xi_k, tau_k);

                    __m128 i_i = _mm_min_ps(_mm_set1_ps((float)maxIndicesLL.x), tau_i);
                    __m128 i_j = _mm_min_ps(_mm_set1_ps((float)maxIndicesLL.y), tau_j);
                    __m128 i_k = _mm_min_ps(_mm_set1_ps((float)maxIndicesLL.z), tau_k);

#if 0
                    __m128i i_i_ = _mm_cvtps_epi32(i_i);
                    __m128i i_j_ = _mm_cvtps_epi32(i_j);
                    __m128i i_k_ = _mm_cvtps_epi32(i_k);
                    __m128i ix00 = _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(sampleCount), _mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(minorCount), i_j_), i_i_)), i_k_);
#else
                    // Use floating-point multiplies instead of integer multiplies, which is faster (and the
                    // dataset is so small that the float mantissa is sufficient).
                    __m128 ix00_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps((float)sampleCount), _mm_add_ps(_mm_mul_ps(_mm_set1_ps((float)minorCount), i_j), i_i)), i_k);
                    __m128i ix00 = _mm_cvtps_epi32(ix00_);
#endif

#if 1
                    // Assume gather is slow
                    __m128 val00 = _mm_setr_ps(v[ix00.m128i_u32[0]],
                                               v[ix00.m128i_u32[1]],
                                               v[ix00.m128i_u32[2]],
                                               v[ix00.m128i_u32[3]]);

                    __m128 val01 = _mm_setr_ps(v[ix00.m128i_u32[0] + minorCount],
                                               v[ix00.m128i_u32[1] + minorCount],
                                               v[ix00.m128i_u32[2] + minorCount],
                                               v[ix00.m128i_u32[3] + minorCount]);

                    __m128 val10 = _mm_setr_ps(v[ix00.m128i_u32[0] + sampleCount],
                                               v[ix00.m128i_u32[1] + sampleCount],
                                               v[ix00.m128i_u32[2] + sampleCount],
                                               v[ix00.m128i_u32[3] + sampleCount]);

                    __m128 val11 = _mm_setr_ps(v[ix00.m128i_u32[0] + minorCount + sampleCount],
                                               v[ix00.m128i_u32[1] + minorCount + sampleCount],
                                               v[ix00.m128i_u32[2] + minorCount + sampleCount],
                                               v[ix00.m128i_u32[3] + minorCount + sampleCount]);
#elif 1
                    // Assume gather is fast
                    __m128i ix01 = _mm_add_epi32(ix00, _mm_set1_epi32(minorCount));
                    __m128i ix10 = _mm_add_epi32(ix00, _mm_set1_epi32(sampleCount));
                    __m128i ix11 = _mm_add_epi32(ix00, _mm_set1_epi32(minorCount + sampleCount));

                    __m128 val00 = _mm_i32gather_ps(v, ix00, 4);
                    __m128 val01 = _mm_i32gather_ps(v, ix01, 4);
                    __m128 val10 = _mm_i32gather_ps(v, ix10, 4);
                    __m128 val11 = _mm_i32gather_ps(v, ix11, 4);
#endif

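                    // lerp(a, b, t) = a + t*(b - a): one subtract plus one FMA per blend; the
                    // commented-out lines below show the equivalent mul/add formulation.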
                    //__m128 val0 = _mm_add_ps(_mm_mul_ps(s_j, val00), _mm_mul_ps(t_j, val01));
                    __m128 dif0 = _mm_sub_ps(val01, val00);
                    __m128 val0 = _mm_fmadd_ps(t_j, dif0, val00);

                    //__m128 val1 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one_ps, t_j), val10), _mm_mul_ps(t_j, val11));
                    __m128 dif1 = _mm_sub_ps(val11, val10);
                    __m128 val1 = _mm_fmadd_ps(t_j, dif1, val10);

                    //__m128 val_ = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one_ps, t_i), val0), _mm_mul_ps(t_i, val1));
                    __m128 dif = _mm_sub_ps(val1, val0);
                    __m128 val_ = _mm_fmadd_ps(t_i, dif, val0);

                    uint32_t index = z * dataSize.y * dataSize.x + y * dataSize.x + x;
                    _mm_store_ps(data + index, val_);

                }
            }
        }
    }
#pragma optimize( "", on )

}