2#include <glm/gtc/type_ptr.hpp>
8 void transformVertex3ToVertex4SSE1(uint8_t* dst,
const size_t dst_stride,
const size_t ,
9 const glm::mat4& matrix,
10 const uint8_t* src,
const size_t src_stride,
const size_t ,
const size_t src_count)
22 __m128 col0 = _mm_loadu_ps(glm::value_ptr(matrix) + 0);
23 __m128 col1 = _mm_loadu_ps(glm::value_ptr(matrix) + 4);
24 __m128 col2 = _mm_loadu_ps(glm::value_ptr(matrix) + 8);
25 __m128 col3 = _mm_loadu_ps(glm::value_ptr(matrix) + 12);
28 __m128 p0 = _mm_loadu_ps((
float*)src); src += src_stride;
29 for (
size_t i = 0; i + 2 < src_count; i++) {
34 __m128 t00 = _mm_mul_ps(col0, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(0, 0, 0, 0)));
35 __m128 t01 = _mm_mul_ps(col1, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(1, 1, 1, 1)));
36 __m128 t02 = _mm_add_ps(t00, t01);
37 __m128 t03 = _mm_mul_ps(col2, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(2, 2, 2, 2)));
38 p0 = _mm_loadu_ps((
float*)src); src += src_stride;
39 __m128 t04 = _mm_add_ps(t03, col3);
40 __m128 t05 = _mm_add_ps(t02, t04);
41 _mm_storeu_ps((
float*)dst, t05); dst += dst_stride;
43 __m128 t00 = _mm_mul_ps(col0, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(0, 0, 0, 0)));
44 __m128 t01 = _mm_mul_ps(col1, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(1, 1, 1, 1)));
45 __m128 t02 = _mm_add_ps(t00, t01);
46 __m128 t03 = _mm_mul_ps(col2, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(2, 2, 2, 2)));
47 __m128 t04 = _mm_add_ps(t03, col3);
48 __m128 t05 = _mm_add_ps(t02, t04);
49 _mm_storeu_ps((
float*)dst, t05); dst += dst_stride;
54 __m128 e0 = _mm_load_ss((
float*)src); src +=
sizeof(float);
55 __m128 t0 = _mm_mul_ps(col0, _mm_shuffle_ps(e0, e0, _MM_SHUFFLE(0, 0, 0, 0)));
57 __m128 e1 = _mm_load_ss((
float*)src); src +=
sizeof(float);
58 __m128 t1 = _mm_mul_ps(col1, _mm_shuffle_ps(e1, e1, _MM_SHUFFLE(0, 0, 0, 0)));
59 __m128 t2 = _mm_add_ps(t0, t1);
61 __m128 e2 = _mm_load_ss((
float*)src);
62 __m128 t3 = _mm_mul_ps(col2, _mm_shuffle_ps(e2, e2, _MM_SHUFFLE(0, 0, 0, 0)));
63 __m128 t4 = _mm_add_ps(t2, col3);
64 __m128 t5 = _mm_add_ps(t3, t4);
66 _mm_storeu_ps((
float*)dst, t5); dst += dst_stride;
69 void transformVertex3ToVertex4FastPathSSE1(uint8_t* dst,
const size_t dst_stride,
const size_t ,
70 const glm::mat4& matrix,
71 const uint8_t* src,
const size_t src_stride,
const size_t ,
const size_t src_count)
84 __m128 col0 = _mm_loadu_ps(glm::value_ptr(matrix) + 0);
85 __m128 col1 = _mm_loadu_ps(glm::value_ptr(matrix) + 4);
86 __m128 col2 = _mm_loadu_ps(glm::value_ptr(matrix) + 8);
87 __m128 col3 = _mm_loadu_ps(glm::value_ptr(matrix) + 12);
90 __m128 p0 = _mm_load_ps((
float*)src); src += src_stride;
91 for (
size_t i = 0; i + 2 < src_count; i++) {
96 __m128 t00 = _mm_mul_ps(col0, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(0, 0, 0, 0)));
97 __m128 t01 = _mm_mul_ps(col1, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(1, 1, 1, 1)));
98 __m128 t02 = _mm_add_ps(t00, t01);
99 __m128 t03 = _mm_mul_ps(col2, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(2, 2, 2, 2)));
100 p0 = _mm_load_ps((
float*)src); src += src_stride;
101 __m128 t04 = _mm_add_ps(t03, col3);
102 __m128 t05 = _mm_add_ps(t02, t04);
103 _mm_stream_ps((
float*)dst, t05); dst += dst_stride;
105 __m128 t00 = _mm_mul_ps(col0, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(0, 0, 0, 0)));
106 __m128 t01 = _mm_mul_ps(col1, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(1, 1, 1, 1)));
107 __m128 t02 = _mm_add_ps(t00, t01);
108 __m128 t03 = _mm_mul_ps(col2, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(2, 2, 2, 2)));
109 __m128 t04 = _mm_add_ps(t03, col3);
110 __m128 t05 = _mm_add_ps(t02, t04);
111 _mm_stream_ps((
float*)dst, t05); dst += dst_stride;
116 __m128 e0 = _mm_load_ss((
float*)src); src +=
sizeof(float);
117 __m128 t0 = _mm_mul_ps(col0, _mm_shuffle_ps(e0, e0, _MM_SHUFFLE(0, 0, 0, 0)));
119 __m128 e1 = _mm_load_ss((
float*)src); src +=
sizeof(float);
120 __m128 t1 = _mm_mul_ps(col1, _mm_shuffle_ps(e1, e1, _MM_SHUFFLE(0, 0, 0, 0)));
121 __m128 t2 = _mm_add_ps(t0, t1);
123 __m128 e2 = _mm_load_ss((
float*)src);
124 __m128 t3 = _mm_mul_ps(col2, _mm_shuffle_ps(e2, e2, _MM_SHUFFLE(0, 0, 0, 0)));
125 __m128 t4 = _mm_add_ps(t2, col3);
126 __m128 t5 = _mm_add_ps(t3, t4);
128 _mm_stream_ps((
float*)dst, t5); dst += dst_stride;
Contains the Engine, Renderer, resource managers and other systems needed to run Cogs.