#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#include <immintrin.h>

#include <glm/glm.hpp>
#include <glm/gtc/quaternion.hpp>

// Rotate eight vectors (given as separate x/y/z registers) by the quaternion q:
// v' = v + 2*(q.w*(q.xyz x v) + q.xyz x (q.xyz x v)).
__forceinline
void quat_times_vec3_ps(__m256& out_x, __m256& out_y, __m256& out_z,
                        const glm::quat& q,        // assumed type; the body reads q.x..q.w
                        const __m256& v_x,
                        const __m256& v_y,
                        const __m256& v_z)
{
    __m256 q_x = _mm256_set1_ps(q.x);
    __m256 q_y = _mm256_set1_ps(q.y);
    __m256 q_z = _mm256_set1_ps(q.z);
    __m256 q_w = _mm256_set1_ps(q.w);
    // uv = cross(q.xyz, v)
    __m256 uv_x = _mm256_fmsub_ps(q_y, v_z, _mm256_mul_ps(v_y, q_z));
    __m256 uv_y = _mm256_fmsub_ps(q_z, v_x, _mm256_mul_ps(v_z, q_x));
    __m256 uv_z = _mm256_fmsub_ps(q_x, v_y, _mm256_mul_ps(v_x, q_y));
    // uuv = cross(q.xyz, uv)
    __m256 uuv_x = _mm256_fmsub_ps(q_y, uv_z, _mm256_mul_ps(uv_y, q_z));
    __m256 uuv_y = _mm256_fmsub_ps(q_z, uv_x, _mm256_mul_ps(uv_z, q_x));
    __m256 uuv_z = _mm256_fmsub_ps(q_x, uv_y, _mm256_mul_ps(uv_x, q_y));
    // t = q.w*uv + uuv
    __m256 t_x = _mm256_fmadd_ps(q_w, uv_x, uuv_x);
    __m256 t_y = _mm256_fmadd_ps(q_w, uv_y, uuv_y);
    __m256 t_z = _mm256_fmadd_ps(q_w, uv_z, uuv_z);
    // v' = v + 2*t
    out_x = _mm256_add_ps(v_x, _mm256_add_ps(t_x, t_x));
    out_y = _mm256_add_ps(v_y, _mm256_add_ps(t_y, t_y));
    out_z = _mm256_add_ps(v_z, _mm256_add_ps(t_z, t_z));
}
// atan(x) approximation for x in [-1, 1]:
//   atan(x) ~= (pi/4)*x - x*(|x| - 1)*(0.2447 + 0.0663*|x|),
// with a maximum absolute error of about 0.00155 (hence the name).
__forceinline __m256 atan_00155_ps(__m256 x)
{
    static const float signBit = -0.f;
    const auto c0 = _mm256_set1_ps(float(3.14159265358979323846264338327950288 / 4.0));
    const auto c1 = _mm256_set1_ps(0.2447f);
    const auto c2 = _mm256_set1_ps(0.0663f);
    const auto c3 = _mm256_set1_ps(1.f);
    __m256 sign = _mm256_set1_ps(signBit);
    __m256 abs_x = _mm256_andnot_ps(sign, x);     // |x|
    __m256 t1 = _mm256_sub_ps(abs_x, c3);         // |x| - 1
    __m256 t3 = _mm256_fmadd_ps(abs_x, c2, c1);   // 0.0663*|x| + 0.2447
    __m256 t2 = _mm256_mul_ps(t1, t3);
    __m256 t4 = _mm256_mul_ps(x, t2);
    __m256 t5 = _mm256_fmsub_ps(c0, x, t4);       // (pi/4)*x - t4
    return t5;
}
// asin(x) approximation (Abramowitz & Stegun 4.4.45 form):
//   asin(x) ~= pi/2 - sqrt(1 - |x|)*(C0 + C1*|x| + C2*|x|^2 + C3*|x|^3),
// with the sign of x restored at the end.
__forceinline __m256 asin_ps(__m256 x)
{
    const auto C0_ps = _mm256_set1_ps(1.5707288f);
    const auto C1_ps = _mm256_set1_ps(-0.2121144f);
    const auto C2_ps = _mm256_set1_ps(0.0742610f);
    const auto C3_ps = _mm256_set1_ps(-0.0187293f);
    const auto sign_ps = _mm256_set1_ps(-0.f);
    const auto one_ps = _mm256_set1_ps(1.f);
    const auto halfpi_ps = _mm256_set1_ps(1.5707963267948966f);
    const auto special = _mm256_set1_ps(std::numeric_limits<float>::infinity());

    __m256 abs_x = _mm256_andnot_ps(sign_ps, x);   // |x|
    __m256 a_ = _mm256_sub_ps(one_ps, abs_x);      // 1 - |x|
#if defined(ASIN_PS_USE_SQRT)                      // assumed switch; the listing carries both variants
    __m256 a = _mm256_sqrt_ps(a_);
#else
    // sqrt(a_) via rsqrt; mask away the infinity rsqrt produces when a_ == 0.
    __m256 t = _mm256_rsqrt_ps(a_);
    __m256 m = _mm256_cmp_ps(special, t, _CMP_NEQ_OQ);
    __m256 a = _mm256_mul_ps(a_, _mm256_and_ps(m, t));
#endif
    // Polynomial in |x|, Horner form.
    __m256 b = C3_ps;
    b = _mm256_fmadd_ps(b, abs_x, C2_ps);
    b = _mm256_fmadd_ps(b, abs_x, C1_ps);
    b = _mm256_fmadd_ps(b, abs_x, C0_ps);

    __m256 rv = _mm256_fmsub_ps(a, b, halfpi_ps);      // a*b - pi/2
    rv = _mm256_andnot_ps(sign_ps, rv);                // = pi/2 - a*b
    rv = _mm256_or_ps(rv, _mm256_and_ps(x, sign_ps));  // copy sign of x
    return rv;
}
// Start-up sanity checks for the approximations above; optimization is disabled
// so the comparisons are not folded away.
#pragma optimize( "", off )
static struct UnitTests
{
    UnitTests()
    {
        const unsigned N = 1000;
        __m256 x, y;
        for (unsigned i = 0; i < N; i++) {
            x = _mm256_set1_ps((2.f / (N - 1))*i - 1.f);
            y = atan_00155_ps(x);
            auto e = std::abs(std::atan(x.m256_f32[7]) - y.m256_f32[7]);
            assert(e < 0.0016f);    // bound assumed from the function's name, with a little slack
        }
        for (unsigned i = 0; i < N; i++) {
            x = _mm256_set1_ps((2.f / (N - 1))*i - 1.f);
            y = asin_ps(x);
            auto e = std::abs(std::asin(x.m256_f32[7]) - y.m256_f32[7]);
            assert(e < 0.005f);     // bound assumed; the listing does not give one
        }
    }
} unitTests;
#pragma optimize( "", on )
namespace Cogs::Core::EchoSounder {

// Scalar reference implementation used by the debug checks below.
// The trailing index argument is assumed here.
glm::vec3 getSamplePosRef(const glm::uvec3 dataSize,
                          const glm::vec3 tp,
                          const glm::vec3 scale,
                          const glm::vec3 arrayPositionGlobal,
                          const glm::vec4* frustum,
                          const float minDistanceSquared,
                          const float maxDistanceSquared,
                          const glm::quat inverseOrientation,
                          const uint32_t coordSys,
                          const glm::vec3 polarScale,
                          const glm::vec3 polarShift,
                          const glm::uvec3 index);
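
// sampleTile3_border_avx2: resample the polar echo data into one Cartesian tile,
// eight voxels along x at a time. Each voxel is culled against the four frustum
// planes and the [minDistanceSquared, maxDistanceSquared] range, rotated into the
// transducer frame by inverseOrientation, converted to fractional polar indices,
// bilinearly interpolated from the polar buffer, and averaged 50/50 with the value
// already stored in 'data'; the running min/max of the stored values is tracked.
// This is the border variant: a block may be only partially valid, so the gathers
// are masked and culled lanes keep their previous contents via a blend.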
void sampleTile3_border_avx2(float * data,
                             float & minVal,          // assumed output parameter, written at the end
                             float & maxVal,          // assumed output parameter, written at the end
                             const float * v,         // assumed: polar sample buffer read by the gathers
                             const glm::uvec3 dataSize,
                             const glm::uvec3 maxIndices,
                             const glm::vec3 tp,      // assumed: tile origin, used for 'ban' below
                             const glm::vec3 scale,
                             const glm::vec3 arrayPositionGlobal,
                             const glm::vec4* frustum,
                             const float minDistanceSquared,
                             const float maxDistanceSquared,
                             const glm::quat inverseOrientation,
                             const uint32_t coordSys,
                             const uint32_t minorCount,
                             const uint32_t sampleCount,
                             const glm::vec3 polarScale,
                             const glm::vec3 polarShift)
{
    const __m128 rot_ = _mm_set_ps(inverseOrientation.w, inverseOrientation.z, inverseOrientation.y, inverseOrientation.x);
    assert((dataSize.x & 7) == 0);   // the x loop below processes 8 voxels per iteration
    assert(coordSys == 1);           // only the polar coordinate system is handled here

    static const __m256 c01234567 = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    static const __m128 one_ps = _mm_setr_ps(1.f, 1.f, 1.f, 1.f);
    glm::vec3 ban = arrayPositionGlobal - tp;   // array position relative to the tile origin

    glm::uvec3 maxIndicesLL = glm::max(maxIndices, glm::uvec3(1u)) - glm::uvec3(1);

    __m256 minValue = _mm256_set1_ps(std::numeric_limits<float>::max());
    __m256 maxValue = _mm256_set1_ps(-std::numeric_limits<float>::max());
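
    // The outer loops walk the tile in z and y; everything that depends only on y and z
    // (the y/z part of the frustum plane dot products and of the squared distance) is
    // computed once in scalar form and broadcast, so the inner x loop only has to add
    // the x-dependent terms for its eight voxels.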
    for (uint32_t z = 0; z < dataSize.z; z++) {
        __m128 pz = _mm_mul_ss(_mm_set1_ps(scale.z), _mm_set1_ps(static_cast<float>(z)));
        __m128 qz = _mm_sub_ss(pz, _mm_set1_ps(ban.z));
        for (uint32_t y = 0; y < dataSize.y; y++) {
            __m128 py = _mm_mul_ss(_mm_set1_ps(scale.y), _mm_set1_ps(static_cast<float>(y)));
            __m128 qy = _mm_sub_ss(py, _mm_set1_ps(ban.y));
            // y/z contribution to the squared distance and to the four frustum plane tests.
            __m128 r2_yz_ = _mm_add_ss(_mm_mul_ss(qy, qy), _mm_mul_ss(qz, qz));
            __m128 in0_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[0].z), qz));
            __m128 in1_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[1].z), qz));
            __m128 in2_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[2].z), qz));
            __m128 in3_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[3].z), qz));
            // Broadcast the scalar results to all eight lanes.
            __m256 in0_dot_yz = _mm256_broadcastss_ps(in0_dot_yz_);
            __m256 in1_dot_yz = _mm256_broadcastss_ps(in1_dot_yz_);
            __m256 in2_dot_yz = _mm256_broadcastss_ps(in2_dot_yz_);
            __m256 in3_dot_yz = _mm256_broadcastss_ps(in3_dot_yz_);
            __m256 r2_yz = _mm256_broadcastss_ps(r2_yz_);
            for (uint32_t x = 0; x < dataSize.x; x += 8) {
                // Voxel positions x, x+1, ..., x+7 relative to the array position.
                __m256 i = _mm256_add_ps(_mm256_set1_ps(static_cast<float>(x)), c01234567);
                __m256 qx = _mm256_fmsub_ps(_mm256_set1_ps(scale.x), i, _mm256_set1_ps(ban.x));
                __m256 r2 = _mm256_fmadd_ps(qx, qx, r2_yz);
                // A lane survives if it lies on the non-negative side of all four frustum planes
                // and its squared distance lies in [minDistanceSquared, maxDistanceSquared].
                __m256 mask0 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_fmadd_ps(_mm256_set1_ps(frustum[0].x), qx, in0_dot_yz), _CMP_LE_OQ);
                __m256 mask1 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_fmadd_ps(_mm256_set1_ps(frustum[1].x), qx, in1_dot_yz), _CMP_LE_OQ);
                __m256 mask2 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_fmadd_ps(_mm256_set1_ps(frustum[2].x), qx, in2_dot_yz), _CMP_LE_OQ);
                __m256 mask3 = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_fmadd_ps(_mm256_set1_ps(frustum[3].x), qx, in3_dot_yz), _CMP_LE_OQ);
                __m256 mask4 = _mm256_cmp_ps(_mm256_set1_ps(minDistanceSquared), r2, _CMP_LE_OQ);
                __m256 mask5 = _mm256_cmp_ps(r2, _mm256_set1_ps(maxDistanceSquared), _CMP_LE_OQ);
                __m256 mask = _mm256_and_ps(_mm256_and_ps(_mm256_and_ps(mask0, mask1),
                                                          _mm256_and_ps(mask2, mask3)),
                                            _mm256_and_ps(mask4, mask5));
                int movemask = _mm256_movemask_ps(mask);
                if (movemask == 0) {   // assumed early-out: no lane survived the culling tests
                    // Debug check (assumed): the scalar reference should also reject these samples.
                    for (unsigned lane = 0; lane < 8; lane++) {
                        glm::vec3 ref = getSamplePosRef(dataSize, tp, scale, arrayPositionGlobal, frustum, minDistanceSquared, maxDistanceSquared,
                                                        inverseOrientation, coordSys, polarScale, polarShift, glm::uvec3(x + lane, y, z));
                        assert(ref.x < 0);
                    }
                    __m256 old = _mm256_load_ps(data);   // untouched block still counts towards min/max
                    minValue = _mm256_min_ps(minValue, old);
                    maxValue = _mm256_max_ps(maxValue, old);
                    data += 8;
                    continue;
                }
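
                // Surviving lanes: rotate the voxel offset into the transducer frame, convert it to
                // two beam angles (via the asin/atan approximations) and take the rotated z component,
                // then map all three through polarScale/polarShift to fractional polar sample indices.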
                __m256 ax, ay, az;
                quat_times_vec3_ps(ax, ay, az,
                                   inverseOrientation,   // quaternion argument assumed
                                   qx, _mm256_broadcastss_ps(qy), _mm256_broadcastss_ps(qz));

                __m256 r_inv = _mm256_rsqrt_ps(r2);
                __m256 r = _mm256_mul_ps(r2, r_inv);      // r ~= sqrt(r2)

                __m256 dirx = asin_ps(_mm256_mul_ps(ax, r_inv));
                __m256 diry = atan_00155_ps(_mm256_mul_ps(ay, _mm256_rcp_ps(az)));
                __m256 xi_i = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.x), _mm256_sub_ps(diry, _mm256_set1_ps(polarShift.x))));
                __m256 xi_j = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.y), _mm256_sub_ps(dirx, _mm256_set1_ps(polarShift.y))));
                __m256 xi_k = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.z), _mm256_sub_ps(az, _mm256_set1_ps(polarShift.z))));
                // Debug check: compare the SIMD indices against the scalar reference, lane by lane.
                for (unsigned lane = 0; lane < 8; lane++) {
                    if (mask.m256_f32[lane] == 0.0)
                        continue;   // lane was culled
                    glm::vec3 ref = getSamplePosRef(dataSize, tp, scale, arrayPositionGlobal, frustum, minDistanceSquared, maxDistanceSquared,
                                                    inverseOrientation, coordSys, polarScale, polarShift, glm::uvec3(x + lane, y, z));
                    if (ref.x < 0)
                        continue;   // reference rejected the sample
                    auto ex = std::abs(ref.x - xi_i.m256_f32[lane]);
                    auto ey = std::abs(ref.y - xi_j.m256_f32[lane]);
                    auto ez = std::abs(ref.z - xi_k.m256_f32[lane]);
                    assert(ex < 0.75f && ey < 0.75f && ez < 0.75f);   // sub-sample agreement; bound assumed
                }
                // Split the fractional indices into integer parts (clamped to the valid range)
                // and interpolation weights.
                __m256 tau_i = _mm256_floor_ps(xi_i);
                __m256 tau_j = _mm256_floor_ps(xi_j);
                __m256 tau_k = _mm256_floor_ps(xi_k);
                __m256 t_i = _mm256_sub_ps(xi_i, tau_i);
                __m256 t_j = _mm256_sub_ps(xi_j, tau_j);
                __m256 t_k = _mm256_sub_ps(xi_k, tau_k);

                __m256 i_i = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.x)), tau_i);
                __m256 i_j = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.y)), tau_j);
                __m256 i_k = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.z)), tau_k);

                // Linear index of the first sample: sampleCount*(minorCount*i_j + i_i) + i_k.
                __m256 ix00_ = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(static_cast<float>(sampleCount)),
                                                           _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(static_cast<float>(minorCount)), i_j), i_i)), i_k);
                __m256i ix00 = _mm256_cvtps_epi32(ix00_);

                __m256i ix01 = _mm256_add_epi32(ix00, _mm256_set1_epi32(minorCount));
                __m256i ix10 = _mm256_add_epi32(ix00, _mm256_set1_epi32(sampleCount));
                __m256i ix11 = _mm256_add_epi32(ix00, _mm256_set1_epi32(minorCount + sampleCount));
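
                // Gather the four neighbouring polar samples (a 2x2 footprint offset by minorCount
                // and sampleCount) and interpolate bilinearly, first with weight t_j, then t_i.
                // The masked gathers keep culled lanes at zero and avoid reading from whatever
                // indices those lanes may carry.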
                __m256 val00 = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), v, ix00, mask, 4);
                __m256 val01 = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), v, ix01, mask, 4);
                __m256 val10 = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), v, ix10, mask, 4);
                __m256 val11 = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), v, ix11, mask, 4);

                __m256 dif0 = _mm256_sub_ps(val01, val00);
                __m256 val0 = _mm256_fmadd_ps(t_j, dif0, val00);

                __m256 dif1 = _mm256_sub_ps(val11, val10);
                __m256 val1 = _mm256_fmadd_ps(t_j, dif1, val10);

                __m256 dif = _mm256_sub_ps(val1, val0);
                __m256 val_ = _mm256_fmadd_ps(t_i, dif, val0);

                __m256 old = _mm256_load_ps(data);
                // Culled lanes keep the old value; surviving lanes are averaged 50/50 with it.
                val_ = _mm256_blendv_ps(old, val_, mask);
                val_ = _mm256_fmadd_ps(_mm256_set1_ps(0.5f), old, _mm256_mul_ps(_mm256_set1_ps(0.5f), val_));

                minValue = _mm256_min_ps(minValue, val_);
                maxValue = _mm256_max_ps(maxValue, val_);

                _mm256_store_ps(data, val_);
                data += 8;
            }
        }
    }
    // Horizontal reduction of the per-lane min/max into the scalar outputs.
    minVal = glm::min(glm::min(glm::min(minValue.m256_f32[0], minValue.m256_f32[1]),
                               glm::min(minValue.m256_f32[2], minValue.m256_f32[3])),
                      glm::min(glm::min(minValue.m256_f32[4], minValue.m256_f32[5]),
                               glm::min(minValue.m256_f32[6], minValue.m256_f32[7])));

    maxVal = glm::max(glm::max(glm::max(maxValue.m256_f32[0], maxValue.m256_f32[1]),
                               glm::max(maxValue.m256_f32[2], maxValue.m256_f32[3])),
                      glm::max(glm::max(maxValue.m256_f32[4], maxValue.m256_f32[5]),
                               glm::max(maxValue.m256_f32[6], maxValue.m256_f32[7])));
}
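
// sampleTile3_avx2: same resampling as the border variant above, but for blocks that
// are known to lie entirely inside the valid region, so the per-lane frustum/distance
// culling, the masked gathers, and the blend are skipped.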
void sampleTile3_avx2(float * data,
                      float & minVal,          // assumed output parameter, as in the border variant
                      float & maxVal,          // assumed output parameter
                      const float * v,         // assumed: polar sample buffer read by the gathers
                      const glm::uvec3 dataSize,
                      const glm::uvec3 maxIndices,
                      const glm::vec3 tp,      // assumed: tile origin
                      const glm::vec3 scale,
                      const glm::vec3 arrayPositionGlobal,
                      const glm::vec4* frustum,
                      const float minDistanceSquared,
                      const float maxDistanceSquared,
                      const glm::quat inverseOrientation,
                      const uint32_t coordSys,
                      const uint32_t minorCount,
                      const uint32_t sampleCount,
                      const glm::vec3 polarScale,
                      const glm::vec3 polarShift)
{
    const __m128 rot_ = _mm_set_ps(inverseOrientation.w, inverseOrientation.z, inverseOrientation.y, inverseOrientation.x);
    assert((dataSize.x & 7) == 0);
    assert(coordSys == 1);

    static const __m256 c01234567 = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    static const __m128 one_ps = _mm_setr_ps(1.f, 1.f, 1.f, 1.f);
    glm::vec3 ban = arrayPositionGlobal - tp;

    glm::uvec3 maxIndicesLL = glm::max(maxIndices, glm::uvec3(1u)) - glm::uvec3(1);

    __m256 minValue = _mm256_set1_ps(std::numeric_limits<float>::max());
    __m256 maxValue = _mm256_set1_ps(-std::numeric_limits<float>::max());
    for (uint32_t z = 0; z < dataSize.z; z++) {
        __m128 pz = _mm_mul_ss(_mm_set1_ps(scale.z), _mm_set1_ps(static_cast<float>(z)));
        __m128 qz = _mm_sub_ss(pz, _mm_set1_ps(ban.z));
        for (uint32_t y = 0; y < dataSize.y; y++) {
            __m128 py = _mm_mul_ss(_mm_set1_ps(scale.y), _mm_set1_ps(static_cast<float>(y)));
            __m128 qy = _mm_sub_ss(py, _mm_set1_ps(ban.y));
            __m128 r2_yz_ = _mm_add_ss(_mm_mul_ss(qy, qy), _mm_mul_ss(qz, qz));
            // Frustum terms are computed as in the border variant, but no culling is applied here.
            __m128 in0_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[0].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[0].z), qz));
            __m128 in1_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[1].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[1].z), qz));
            __m128 in2_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[2].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[2].z), qz));
            __m128 in3_dot_yz_ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(frustum[3].y), qy), _mm_mul_ps(_mm_set1_ps(frustum[3].z), qz));

            __m256 in0_dot_yz = _mm256_broadcastss_ps(in0_dot_yz_);
            __m256 in1_dot_yz = _mm256_broadcastss_ps(in1_dot_yz_);
            __m256 in2_dot_yz = _mm256_broadcastss_ps(in2_dot_yz_);
            __m256 in3_dot_yz = _mm256_broadcastss_ps(in3_dot_yz_);
            __m256 r2_yz = _mm256_broadcastss_ps(r2_yz_);
            for (uint32_t x = 0; x < dataSize.x; x += 8) {
                __m256 i = _mm256_add_ps(_mm256_set1_ps(static_cast<float>(x)), c01234567);
                __m256 qx = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(scale.x), i), _mm256_set1_ps(ban.x));
                __m256 r2 = _mm256_add_ps(_mm256_mul_ps(qx, qx), r2_yz);

                __m256 ax, ay, az;
                quat_times_vec3_ps(ax, ay, az,
                                   inverseOrientation,   // quaternion argument assumed
                                   qx, _mm256_broadcastss_ps(qy), _mm256_broadcastss_ps(qz));

                __m256 r_inv = _mm256_rsqrt_ps(r2);
                __m256 r = _mm256_rcp_ps(r_inv);          // r ~= sqrt(r2)

                __m256 dirx = asin_ps(_mm256_mul_ps(ax, r_inv));
                __m256 diry = atan_00155_ps(_mm256_mul_ps(ay, _mm256_rcp_ps(az)));
                __m256 xi_i = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.x), _mm256_sub_ps(diry, _mm256_set1_ps(polarShift.x))));
                __m256 xi_j = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.y), _mm256_sub_ps(dirx, _mm256_set1_ps(polarShift.y))));
                __m256 xi_k = _mm256_max_ps(_mm256_setzero_ps(), _mm256_mul_ps(_mm256_set1_ps(polarScale.z), _mm256_sub_ps(az, _mm256_set1_ps(polarShift.z))));
                // Debug check: compare against the scalar reference, lane by lane.
                for (unsigned lane = 0; lane < 8; lane++) {
                    glm::vec3 ref = getSamplePosRef(dataSize, tp, scale, arrayPositionGlobal, frustum, minDistanceSquared, maxDistanceSquared,
                                                    inverseOrientation, coordSys, polarScale, polarShift, glm::uvec3(x + lane, y, z));
                    auto ex = std::abs(ref.x - xi_i.m256_f32[lane]);
                    auto ey = std::abs(ref.y - xi_j.m256_f32[lane]);
                    auto ez = std::abs(ref.z - xi_k.m256_f32[lane]);
                    assert(ex < 0.75f && ey < 0.75f && ez < 0.75f);   // sub-sample agreement; bound assumed
                }
                __m256 tau_i = _mm256_floor_ps(xi_i);
                __m256 tau_j = _mm256_floor_ps(xi_j);
                __m256 tau_k = _mm256_floor_ps(xi_k);
                __m256 t_i = _mm256_sub_ps(xi_i, tau_i);
                __m256 t_j = _mm256_sub_ps(xi_j, tau_j);
                __m256 t_k = _mm256_sub_ps(xi_k, tau_k);

                __m256 i_i = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.x)), tau_i);
                __m256 i_j = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.y)), tau_j);
                __m256 i_k = _mm256_min_ps(_mm256_set1_ps(static_cast<float>(maxIndicesLL.z)), tau_k);

                __m256 ix00_ = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(static_cast<float>(sampleCount)),
                                                           _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(static_cast<float>(minorCount)), i_j), i_i)), i_k);
                __m256i ix00 = _mm256_cvtps_epi32(ix00_);

                __m256i ix01 = _mm256_add_epi32(ix00, _mm256_set1_epi32(minorCount));
                __m256i ix10 = _mm256_add_epi32(ix00, _mm256_set1_epi32(sampleCount));
                __m256i ix11 = _mm256_add_epi32(ix00, _mm256_set1_epi32(minorCount + sampleCount));
                // Unmasked gathers and bilinear interpolation, as in the border variant.
                __m256 val00 = _mm256_i32gather_ps(v, ix00, 4);
                __m256 val01 = _mm256_i32gather_ps(v, ix01, 4);
                __m256 val10 = _mm256_i32gather_ps(v, ix10, 4);
                __m256 val11 = _mm256_i32gather_ps(v, ix11, 4);

                __m256 dif0 = _mm256_sub_ps(val01, val00);
                __m256 val0 = _mm256_fmadd_ps(t_j, dif0, val00);

                __m256 dif1 = _mm256_sub_ps(val11, val10);
                __m256 val1 = _mm256_fmadd_ps(t_j, dif1, val10);

                __m256 dif = _mm256_sub_ps(val1, val0);
                __m256 val_ = _mm256_fmadd_ps(t_i, dif, val0);

                // Average with the existing contents; min/max here track the previous contents ('old').
                __m256 old = _mm256_loadu_ps(data);
                val_ = _mm256_fmadd_ps(_mm256_set1_ps(0.5f), old, _mm256_mul_ps(_mm256_set1_ps(0.5f), val_));

                minValue = _mm256_min_ps(minValue, old);
                maxValue = _mm256_max_ps(maxValue, old);

                _mm256_store_ps(data, val_);
                data += 8;
            }
        }
    }
    minVal = glm::min(glm::min(glm::min(minValue.m256_f32[0], minValue.m256_f32[1]),
                               glm::min(minValue.m256_f32[2], minValue.m256_f32[3])),
                      glm::min(glm::min(minValue.m256_f32[4], minValue.m256_f32[5]),
                               glm::min(minValue.m256_f32[6], minValue.m256_f32[7])));

    maxVal = glm::max(glm::max(glm::max(maxValue.m256_f32[0], maxValue.m256_f32[1]),
                               glm::max(maxValue.m256_f32[2], maxValue.m256_f32[3])),
                      glm::max(glm::max(maxValue.m256_f32[4], maxValue.m256_f32[5]),
                               glm::max(maxValue.m256_f32[6], maxValue.m256_f32[7])));
}

} // namespace Cogs::Core::EchoSounder