Cogs.Core: Extensions/EchoSounder/Source/Tasks/SampleVolumeTask

#if 0
#include "SampleVolumeTask.h"
 
using namespace Cogs::Core;
 
using glm::uvec3;
using glm::ivec3;
using glm::vec3;
using glm::clamp;
using glm::max;
using glm::min;
using glm::floor;
using glm::ceil;
using glm::inverse;
 
namespace {
  // Scalar constants; the SSE helpers below splat them across all four lanes.
  static const float piTwo = 1.5707963267948966f;  // pi / 2
  static const float pi = 3.1415926535897931f;
  static const float signBit = -0.f;               // float whose only set bit is the sign bit
  static const float one = 1.f;

  // Coefficients of the cubic p(x) = C0 + C1*x + C2*x^2 + C3*x^3 used by asin_ps:
  //   asin(x) ~= pi/2 - sqrt(1 - x) * p(x)   for 0 <= x <= 1.
  static const float asin_deg3_C3 = -0.0187293f;
  static const float asin_deg3_C2 = 0.0742610f;
  static const float asin_deg3_C1 = -0.2121144f;
  static const float asin_deg3_C0 = 1.5707288f;
  // Packed so that lane i holds Ci (_mm_set_ps lists its arguments from lane 3 down to lane 0).
  static const __m128 asin_deg3_C = _mm_set_ps(asin_deg3_C3, asin_deg3_C2, asin_deg3_C1, asin_deg3_C0);

  // Coefficients of the degree-7 variant of the same approximation (higher accuracy,
  // currently selected out by the preprocessor switch in asin_ps).
  static const float asin_deg7_C7 = -0.0012624911f;
  static const float asin_deg7_C6 = 0.0066700901f;
  static const float asin_deg7_C5 = -0.0170881256f;
  static const float asin_deg7_C4 = 0.0308918810f;
  static const float asin_deg7_C3 = -0.0501743046f;
  static const float asin_deg7_C2 = 0.0889789874f;
  static const float asin_deg7_C1 = -0.2145988016f;
  static const float asin_deg7_C0 = 1.5707963050f;


  // Replicates lane `lane` (0..3) of x into all four lanes of the result.
  template<int lane> inline __m128 broadcast_ps(__m128 x) {
    return _mm_shuffle_ps(x, x, _MM_SHUFFLE(lane, lane, lane, lane));
  }

  // Four-wide approximate arcsine.
  //
  // Computes pi/2 - sqrt(1 - |x|) * p(|x|) per lane and then copies the sign of x
  // onto the result. p is evaluated in Horner form; the previous code summed
  // Ci*|x| for every coefficient, which collapsed the polynomial to a linear
  // function and gave errors of roughly 1e-2 for mid-range arguments.
  //
  // Inputs are expected in [-1, 1]; for |x| > 1 the square-root step receives a
  // negative argument and the lane comes out as NaN.
  __m128 asin_ps(__m128 x)
  {
#if 0
    // Reference
    __m128 rv;
    for (int i = 0; i < 4; i++) {
      rv.m128_f32[i] = std::asin(x.m128_f32[i]);
    }
    return rv;
#endif

    const __m128 sign = _mm_load1_ps(&signBit);
    const __m128 abs_x = _mm_andnot_ps(sign, x);   // clear the sign bit -> |x|

#if 1
    // Cubic polynomial. Original note: max error < 2e-5 (this refers to the
    // polynomial alone, not to the approximate square root used below).
    const __m128 C = asin_deg3_C;
    __m128 r = broadcast_ps<3>(C);
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), broadcast_ps<2>(C));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), broadcast_ps<1>(C));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), broadcast_ps<0>(C));
#elif 1
    // Degree-7 polynomial. Original note: max error < 2e-8.
    __m128 r = _mm_load1_ps(&asin_deg7_C7);
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_load1_ps(&asin_deg7_C6));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_load1_ps(&asin_deg7_C5));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_load1_ps(&asin_deg7_C4));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_load1_ps(&asin_deg7_C3));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_load1_ps(&asin_deg7_C2));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_load1_ps(&asin_deg7_C1));
    r = _mm_add_ps(_mm_mul_ps(r, abs_x), _mm_load1_ps(&asin_deg7_C0));
#endif

    __m128 q = _mm_sub_ps(_mm_load1_ps(&one), abs_x);
#if 0
    q = _mm_sqrt_ps(q);
#else
    // sqrt(q) as 1 / rsqrt(q) using the fast reciprocal estimates. Both estimates
    // are low precision (about 12 bits), so this dominates the total error.
    // q == 0 yields rsqrt = +inf and rcp(+inf) = 0, i.e. the exact result.
    q = _mm_rcp_ps(_mm_rsqrt_ps(q));
#endif

    r = _mm_sub_ps(_mm_load1_ps(&piTwo), _mm_mul_ps(q, r));

    // copy sign from x
    return _mm_or_ps(r, _mm_and_ps(x, sign));
  }

  // Rotates four vectors by one quaternion.
  //
  //   q               quaternion with lanes (x, y, z, w) = lanes (0, 1, 2, 3).
  //   v_x, v_y, v_z   components of the four input vectors (one vector per lane).
  //   out_x/y/z       components of the four rotated vectors.
  //
  // Uses v' = v + 2 * (w * (u x v) + u x (u x v)) with u = q.xyz. The outputs
  // are written only after all inputs have been read.
  void quat_times_vec3_ps(__m128& out_x, __m128& out_y, __m128& out_z,
                          const __m128& q,
                          const __m128& v_x, const __m128& v_y, const __m128& v_z)
  {
#if 0
    // Reference
    glm::quat rot;
    rot.x = q.m128_f32[0];
    rot.y = q.m128_f32[1];
    rot.z = q.m128_f32[2];
    rot.w = q.m128_f32[3];
    for (int i = 0; i < 4; i++) {
      glm::vec3 v;
      v.x = v_x.m128_f32[i];
      v.y = v_y.m128_f32[i];
      v.z = v_z.m128_f32[i];

      glm::vec3 w = rot*v;
      out_x.m128_f32[i] = w.x;
      out_y.m128_f32[i] = w.y;
      out_z.m128_f32[i] = w.z;
    }
    return;
#endif

    // Splat each quaternion component across a full register.
    const __m128 q_x = broadcast_ps<0>(q);
    const __m128 q_y = broadcast_ps<1>(q);
    const __m128 q_z = broadcast_ps<2>(q);
    const __m128 q_w = broadcast_ps<3>(q);

    // uv = u x v
    const __m128 uv_x = _mm_sub_ps(_mm_mul_ps(q_y, v_z), _mm_mul_ps(v_y, q_z));
    const __m128 uv_y = _mm_sub_ps(_mm_mul_ps(q_z, v_x), _mm_mul_ps(v_z, q_x));
    const __m128 uv_z = _mm_sub_ps(_mm_mul_ps(q_x, v_y), _mm_mul_ps(v_x, q_y));

    // uuv = u x uv
    const __m128 uuv_x = _mm_sub_ps(_mm_mul_ps(q_y, uv_z), _mm_mul_ps(uv_y, q_z));
    const __m128 uuv_y = _mm_sub_ps(_mm_mul_ps(q_z, uv_x), _mm_mul_ps(uv_z, q_x));
    const __m128 uuv_z = _mm_sub_ps(_mm_mul_ps(q_x, uv_y), _mm_mul_ps(uv_x, q_y));

    // t = w * uv + uuv
    const __m128 t_x = _mm_add_ps(_mm_mul_ps(q_w, uv_x), uuv_x);
    const __m128 t_y = _mm_add_ps(_mm_mul_ps(q_w, uv_y), uuv_y);
    const __m128 t_z = _mm_add_ps(_mm_mul_ps(q_w, uv_z), uuv_z);

    // v' = v + 2 t
    out_x = _mm_add_ps(v_x, _mm_add_ps(t_x, t_x));
    out_y = _mm_add_ps(v_y, _mm_add_ps(t_y, t_y));
    out_z = _mm_add_ps(v_z, _mm_add_ps(t_z, t_z));
  }

}
 
 
// Resamples ping data (indexed by alongship beam, athwartship beam and range
// sample) onto a regular grid, processing four consecutive x cells per iteration
// with SSE/SSE4.1 intrinsics.
//
//   dstValues / timeValues  output grids, addressed as (k*gridSizeY + j)*gridSizeX + i.
//   pinf, numPings          the pings to splat; each one is processed in turn.
//   upperFansToRemove       number of trailing alongship fans excluded from sampling.
//   numSamples              samples per beam in each ping's `field`.
//   depthOffset, depthStep  range of sample 0 and the increment per sample.
//   sampleSpacing           grid cell size; cell index * sampleSpacing gives the position.
//   decay                   currently unused.
//   beamAngle*              beam angles; only the first and last entries are read, so
//                           presumably the angles are evenly spaced (TODO confirm).
//   this_minIndex/maxIndex  grid index range to fill; the upper bound is exclusive.
//
// For each grid cell that passes the ping's four frustum planes and its
// min/max squared depth test, the cell position is rotated into the array frame,
// converted to (athwartship angle, alongship angle, range), mapped to fractional
// source indices, bilinearly interpolated across the two beam axes (nearest
// sample along range) and blended 50/50 with the value already in the cell.
//
// Returns without touching the output when there is nothing that can be
// sampled: no pings, no samples, or fewer than two beams along either axis
// (the bilinear fetch always reads beam index tau and tau + 1).
void EchoSounder::SampleVolumeTask2::runSSE4_1(TileValue * dstValues,
                                               uint16_t * timeValues,
                                               const EchoSounder::PingInfo* pinf,
                                               const uint32_t upperFansToRemove,
                                               const uint32_t numPings,
                                               const uint32_t numSamples,
                                               const uint32_t gridSizeX,
                                               const uint32_t gridSizeY,
                                               const float depthOffset,
                                               const float depthStep,
                                               const float sampleSpacing,
                                               const float decay,
                                               const std::vector<float>& beamAngleAlongship,
                                               const std::vector<float>& beamAngleAthwartship,
                                               const glm::ivec3& this_minIndex,
                                               const glm::ivec3& this_maxIndex)
{
  const auto numBeamsMajor = static_cast<uint32_t>(beamAngleAlongship.size());
  const auto numBeamsMinor = static_cast<uint32_t>(beamAngleAthwartship.size());

  // Guard against inputs that would otherwise dereference pinf with no pings,
  // call front()/back() on an empty vector, or read beam index 1 of a single beam.
  if (numPings == 0 || numSamples == 0 || numBeamsMajor < 2 || numBeamsMinor < 2) {
    return;
  }

  // NOTE(review): the time stamp of the first ping is written for every ping
  // processed below - confirm this is intended rather than pinf[p].time.
  const auto time = pinf->time;

  const vec3 minPolar(beamAngleAthwartship.front(),
                      beamAngleAlongship.front(),
                      depthOffset);
  const vec3 maxPolar(beamAngleAthwartship.back(),
                      beamAngleAlongship.back(),
                      depthOffset + (numSamples - 1)*depthStep);

  // Largest admissible lower-corner index per axis (the fetch reads tau and tau + 1
  // along the two beam axes).
  const uvec3 maxTau(max(2u, numBeamsMinor) - 2,
                     max(2u + upperFansToRemove, numBeamsMajor) - 2 - upperFansToRemove,
                     max(2u, numSamples) - 2);

  // Scale from polar offset to fractional source index: minPolar maps to 0 and
  // maxPolar maps to N-2.
  // NOTE(review): the original comment said "maxPolar to N-1"; the code uses N-2.
  const vec3 polarToIndex = vec3(max(2u, numBeamsMinor) - 2,
                                 max(2u, numBeamsMajor) - 2,
                                 max(2u, numSamples) - 2) / (maxPolar - minPolar);

  // Loop-invariant splats.
  const __m128 zero = _mm_setzero_ps();
  const __m128 spacing = _mm_set1_ps(sampleSpacing);
  const __m128 scale_x = _mm_set1_ps(polarToIndex.x);
  const __m128 scale_y = _mm_set1_ps(polarToIndex.y);
  const __m128 scale_z = _mm_set1_ps(polarToIndex.z);
  const __m128 origin_x = _mm_set1_ps(minPolar.x);
  const __m128 origin_y = _mm_set1_ps(minPolar.y);
  const __m128 origin_z = _mm_set1_ps(minPolar.z);
  const __m128i upper_x = _mm_set1_epi32(static_cast<int>(maxTau.x));
  const __m128i upper_y = _mm_set1_epi32(static_cast<int>(maxTau.y));
  const __m128i upper_z = _mm_set1_epi32(static_cast<int>(maxTau.z));

  // Strides into a ping's field, laid out as [major beam][minor beam][sample].
  const size_t beamStride = numSamples;
  const size_t fanStride = static_cast<size_t>(numBeamsMinor) * numSamples;

  for (uint32_t p = 0; p < numPings; p++) {
    const auto& ping = pinf[p];

    // Per-ping clamping against the ping's bounding box is disabled; the full
    // requested index range is scanned and culled by the frustum test instead.
    const auto minIndex = this_minIndex;
    const auto maxIndex = this_maxIndex;
    const auto * srcValues = ping.field;

    // Inverse array orientation, packed as lanes (x, y, z, w) for quat_times_vec3_ps.
    const auto rot = inverse(ping.metaPing.arrayOrientationGlobal);
    const __m128 rot_ = _mm_set_ps(rot.w, rot.z, rot.y, rot.x);
    const auto shift = ping.arrayPositionTile;

    // x components of the four frustum plane normals, one plane per lane.
    const __m128 n_x = _mm_set_ps(
      ping.boundingFrustumNormals[3].x,
      ping.boundingFrustumNormals[2].x,
      ping.boundingFrustumNormals[1].x,
      ping.boundingFrustumNormals[0].x
    );
    const __m128 n_x0 = broadcast_ps<0>(n_x);
    const __m128 n_x1 = broadcast_ps<1>(n_x);
    const __m128 n_x2 = broadcast_ps<2>(n_x);
    const __m128 n_x3 = broadcast_ps<3>(n_x);

    const __m128 shift_x = _mm_set1_ps(shift.x);
    const __m128 minDepth2 = _mm_set1_ps(ping.minDepthSquared);
    const __m128 maxDepth2 = _mm_set1_ps(ping.maxDepthSquared);
    const __m128 xEnd = _mm_set1_ps(static_cast<float>(maxIndex.x));

    for (int k = minIndex.z; k < maxIndex.z; k++) {

      const float samplePosGrid_z = sampleSpacing*k;
      if (samplePosGrid_z < ping.depthRestriction) continue;

      const auto l_z = sampleSpacing*k - shift.z;

      for (int j = minIndex.y; j < maxIndex.y; j++) {
        const auto l_y = sampleSpacing*j - shift.y;

        // y/z part of each plane equation; the x part is added per lane below.
        const __m128 eq_a = _mm_set_ps(
          ping.boundingFrustumNormals[3].z * l_z + ping.boundingFrustumNormals[3].y*l_y,
          ping.boundingFrustumNormals[2].z * l_z + ping.boundingFrustumNormals[2].y*l_y,
          ping.boundingFrustumNormals[1].z * l_z + ping.boundingFrustumNormals[1].y*l_y,
          ping.boundingFrustumNormals[0].z * l_z + ping.boundingFrustumNormals[0].y*l_y
        );
        const __m128 eq_a0 = broadcast_ps<0>(eq_a);
        const __m128 eq_a1 = broadcast_ps<1>(eq_a);
        const __m128 eq_a2 = broadcast_ps<2>(eq_a);
        const __m128 eq_a3 = broadcast_ps<3>(eq_a);

        const auto l_zy_2 = l_z*l_z + l_y*l_y;
        const __m128 l_zy_2_ = _mm_set1_ps(l_zy_2);
        const __m128 l_y_ = _mm_set1_ps(l_y);
        const __m128 l_z_ = _mm_set1_ps(l_z);

        // First destination cell of this row.
        const size_t dstRow = (static_cast<size_t>(k)*gridSizeY + j)*gridSizeX;

        for (int i = minIndex.x; i < maxIndex.x; i+=4) {

          const __m128 ii = _mm_set_ps(static_cast<float>(i + 3),
                                       static_cast<float>(i + 2),
                                       static_cast<float>(i + 1),
                                       static_cast<float>(i + 0));

          // Disable tail lanes at or beyond the exclusive upper bound. This was
          // previously an inclusive compare, which let the lane at maxIndex.x
          // write one cell past the requested range.
          __m128 mask = _mm_cmplt_ps(ii, xEnd);

          const __m128 l_x = _mm_sub_ps(_mm_mul_ps(spacing, ii), shift_x);
          const __m128 r2_ = _mm_add_ps(l_zy_2_, _mm_mul_ps(l_x, l_x));

          // Early exit: check against frustum planes and the squared depth range.
          const __m128 mask_0 = _mm_cmple_ps(zero, _mm_add_ps(eq_a0, _mm_mul_ps(n_x0, l_x)));
          const __m128 mask_1 = _mm_cmple_ps(zero, _mm_add_ps(eq_a1, _mm_mul_ps(n_x1, l_x)));
          const __m128 mask_2 = _mm_cmple_ps(zero, _mm_add_ps(eq_a2, _mm_mul_ps(n_x2, l_x)));
          const __m128 mask_3 = _mm_cmple_ps(zero, _mm_add_ps(eq_a3, _mm_mul_ps(n_x3, l_x)));
          const __m128 mask_4 = _mm_cmple_ps(minDepth2, r2_);
          const __m128 mask_5 = _mm_cmple_ps(r2_, maxDepth2);
          mask = _mm_and_ps(_mm_and_ps(_mm_and_ps(mask_0, mask_1),
                                       _mm_and_ps(mask_2, mask_3)),
                            _mm_and_ps(_mm_and_ps(mask_4, mask_5),
                                       mask));
          if (_mm_movemask_ps(mask) == 0) {
            continue;
          }

          // Position relative to the array, rotated into the array frame.
          __m128 cartesian_x, cartesian_y, cartesian_z;
          quat_times_vec3_ps(cartesian_x, cartesian_y, cartesian_z,
                             rot_,
                             l_x, l_y_, l_z_);

          // Approximate 1/r and r from the squared range.
          const __m128 rep_r = _mm_rsqrt_ps(r2_);
          const __m128 r_ = _mm_rcp_ps(rep_r);

          const __m128 dirX = asin_ps(_mm_mul_ps(cartesian_x, rep_r)); //std::asin(cartesianPos.x / r);
          const __m128 dirY = asin_ps(_mm_mul_ps(cartesian_y, rep_r)); //std::asin(cartesianPos.y / r);

          // calculate xi (source indices with fractional part).
          const __m128 xi_x = _mm_mul_ps(scale_x, _mm_sub_ps(dirY, origin_x));
          const __m128 xi_y = _mm_mul_ps(scale_y, _mm_sub_ps(dirX, origin_y));
          const __m128 xi_z = _mm_mul_ps(scale_z, _mm_sub_ps(r_, origin_z));

          const __m128 tau_x = _mm_floor_ps(xi_x);  // floor is SSE4.1
          const __m128 tau_y = _mm_floor_ps(xi_y);
          const __m128 tau_z = _mm_floor_ps(xi_z);

          const __m128i tau_x_vui = _mm_cvtps_epi32(tau_x);
          const __m128i tau_y_vui = _mm_cvtps_epi32(tau_y);
          const __m128i tau_z_vui = _mm_cvtps_epi32(tau_z);
          const __m128 bx_ = _mm_sub_ps(xi_x, tau_x);
          const __m128 by_ = _mm_sub_ps(xi_y, tau_y);

          // Unsigned "tau <= upper" test: max(tau, upper) == upper. Negative
          // indices (and the 0x80000000 produced for NaN/overflow) look like huge
          // unsigned values and are rejected by the same comparison.
          const __m128i inrange_x = _mm_cmpeq_epi32(_mm_max_epu32(tau_x_vui, upper_x), upper_x);
          const __m128i inrange_y = _mm_cmpeq_epi32(_mm_max_epu32(tau_y_vui, upper_y), upper_y);
          const __m128i inrange_z = _mm_cmpeq_epi32(_mm_max_epu32(tau_z_vui, upper_z), upper_z);

          mask = _mm_and_ps(_mm_and_ps(mask, _mm_castsi128_ps(inrange_x)),
                            _mm_and_ps(_mm_castsi128_ps(inrange_y), _mm_castsi128_ps(inrange_z)));

          const int movemask = _mm_movemask_ps(mask);
          if (movemask == 0) continue;

          // Spill the per-lane results to memory for the scalar gather below
          // (portable replacement for the MSVC-only m128i_u32 / m128_f32 members).
          alignas(16) uint32_t tauLane_x[4];
          alignas(16) uint32_t tauLane_y[4];
          alignas(16) uint32_t tauLane_z[4];
          alignas(16) float fracLane_x[4];
          alignas(16) float fracLane_y[4];
          _mm_store_si128(reinterpret_cast<__m128i*>(tauLane_x), tau_x_vui);
          _mm_store_si128(reinterpret_cast<__m128i*>(tauLane_y), tau_y_vui);
          _mm_store_si128(reinterpret_cast<__m128i*>(tauLane_z), tau_z_vui);
          _mm_store_ps(fracLane_x, bx_);
          _mm_store_ps(fracLane_y, by_);

          for (uint32_t lane = 0; lane < 4; lane++) {
            if (((movemask >> lane) & 1) == 0) continue;

            const size_t ix_x = tauLane_x[lane];
            const size_t ix_y = tauLane_y[lane];
            const size_t ix_z = tauLane_z[lane];

            // Bi-linear interpolation, nearest nb along depth.
            const float bx = fracLane_x[lane];//xi.x - tau.x;
            const float by = fracLane_y[lane];//xi.y - tau.y;

            const size_t srcBase = ix_y*fanStride + ix_x*beamStride + ix_z;

            const auto val00 = srcValues[srcBase];
            const auto val01 = srcValues[srcBase + fanStride];
            const auto val0 = (1.f - by)*PingValue_Decode(val00) + by*PingValue_Decode(val01);

            const auto val10 = srcValues[srcBase + beamStride];
            const auto val11 = srcValues[srcBase + fanStride + beamStride];
            const auto val1 = (1.f - by)*PingValue_Decode(val10) + by*PingValue_Decode(val11);

            const auto val = (1.f - bx)*val0 + bx*val1;

            const size_t dstOffset = dstRow + i + lane;

            // A stored value of exactly zero is replaced by the new sample, so
            // the 50/50 blend below then writes the new sample unchanged.
            auto oldValue = TileValue_Decode(dstValues[dstOffset]);
            if (oldValue == 0.0) oldValue = val;

            dstValues[dstOffset] = TileValue_Encode(0.5f*oldValue + 0.5f*val);
            //dstValues[dstOffset] = max(oldValue, val);
            timeValues[dstOffset] = time;
          }
        }
      }
    }
  }
}
#endif