Cogs.Core
TransformVertices_sse.cpp
#ifndef EMSCRIPTEN
#include <cstddef>
#include <cstdint>

#include <glm/gtc/type_ptr.hpp>
#include <xmmintrin.h>

namespace Cogs::Core
{

    void transformVertex3ToVertex4SSE1(uint8_t* dst, const size_t dst_stride, const size_t /*dst_bytes*/,
                                       const glm::mat4& matrix,
                                       const uint8_t* src, const size_t src_stride, const size_t /*src_bytes*/, const size_t src_count)
    {
        if (src_count == 0) {
            return;
        }
        // _mm_loadu_ps SSE1
        // _mm_load_ss SSE1
        // _mm_shuffle_ps SSE1
        // _mm_mul_ps SSE1
        // _mm_add_ps SSE1
        // _mm_storeu_ps SSE1

        // glm matrices are column-major: value_ptr(matrix) + 4*i is the i-th column.
        __m128 col0 = _mm_loadu_ps(glm::value_ptr(matrix) + 0);
        __m128 col1 = _mm_loadu_ps(glm::value_ptr(matrix) + 4);
        __m128 col2 = _mm_loadu_ps(glm::value_ptr(matrix) + 8);
        __m128 col3 = _mm_loadu_ps(glm::value_ptr(matrix) + 12);

        if (1 < src_count) {
            __m128 p0 = _mm_loadu_ps((const float*)src); src += src_stride;
            for (size_t i = 0; i + 2 < src_count; i++) {
                // Column-major: m_ij denotes matrix[i][j], i.e. column i, row j.
                // x'_k = m_00 x_k + m_10 y_k + m_20 z_k + m_30
                // y'_k = m_01 x_k + m_11 y_k + m_21 z_k + m_31
                // z'_k = m_02 x_k + m_12 y_k + m_22 z_k + m_32
                // w'_k = m_03 x_k + m_13 y_k + m_23 z_k + m_33
                __m128 t00 = _mm_mul_ps(col0, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(0, 0, 0, 0)));
                __m128 t01 = _mm_mul_ps(col1, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(1, 1, 1, 1)));
                __m128 t02 = _mm_add_ps(t00, t01);
                __m128 t03 = _mm_mul_ps(col2, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(2, 2, 2, 2)));
                // Load the next vertex early so the load overlaps the remaining arithmetic.
                p0 = _mm_loadu_ps((const float*)src); src += src_stride;
                __m128 t04 = _mm_add_ps(t03, col3);
                __m128 t05 = _mm_add_ps(t02, t04);
                _mm_storeu_ps((float*)dst, t05); dst += dst_stride;
            }
            // Transform the last vertex that was fetched with a full 16-byte load
            // (the second-to-last vertex overall).
            __m128 t00 = _mm_mul_ps(col0, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(0, 0, 0, 0)));
            __m128 t01 = _mm_mul_ps(col1, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(1, 1, 1, 1)));
            __m128 t02 = _mm_add_ps(t00, t01);
            __m128 t03 = _mm_mul_ps(col2, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(2, 2, 2, 2)));
            __m128 t04 = _mm_add_ps(t03, col3);
            __m128 t05 = _mm_add_ps(t02, t04);
            _mm_storeu_ps((float*)dst, t05); dst += dst_stride;
        }

        // For the last element, use three single-scalar loads to avoid reading
        // beyond the end of the source buffer.
        __m128 e0 = _mm_load_ss((const float*)src); src += sizeof(float);
        __m128 t0 = _mm_mul_ps(col0, _mm_shuffle_ps(e0, e0, _MM_SHUFFLE(0, 0, 0, 0)));

        __m128 e1 = _mm_load_ss((const float*)src); src += sizeof(float);
        __m128 t1 = _mm_mul_ps(col1, _mm_shuffle_ps(e1, e1, _MM_SHUFFLE(0, 0, 0, 0)));
        __m128 t2 = _mm_add_ps(t0, t1);

        __m128 e2 = _mm_load_ss((const float*)src);
        __m128 t3 = _mm_mul_ps(col2, _mm_shuffle_ps(e2, e2, _MM_SHUFFLE(0, 0, 0, 0)));
        __m128 t4 = _mm_add_ps(t2, col3);
        __m128 t5 = _mm_add_ps(t3, t4);

        _mm_storeu_ps((float*)dst, t5); dst += dst_stride;
    }
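
    // The SSE routines in this file compute dst_k = matrix * vec4(src_k, 1) for each
    // input vertex. A plain scalar equivalent is sketched below for reference; the name
    // transformVertex3ToVertex4ScalarRef and its reduced parameter list are illustrative
    // only and not part of the original interface.
    void transformVertex3ToVertex4ScalarRef(uint8_t* dst, const size_t dst_stride,
                                            const glm::mat4& matrix,
                                            const uint8_t* src, const size_t src_stride, const size_t src_count)
    {
        for (size_t k = 0; k < src_count; k++) {
            // Read three source floats, extend with w = 1, transform, and write four floats.
            const float* p = reinterpret_cast<const float*>(src + k * src_stride);
            float* q = reinterpret_cast<float*>(dst + k * dst_stride);
            const glm::vec4 r = matrix * glm::vec4(p[0], p[1], p[2], 1.0f);
            q[0] = r.x; q[1] = r.y; q[2] = r.z; q[3] = r.w;
        }
    }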

    void transformVertex3ToVertex4FastPathSSE1(uint8_t* dst, const size_t dst_stride, const size_t /*dst_bytes*/,
                                               const glm::mat4& matrix,
                                               const uint8_t* src, const size_t src_stride, const size_t /*src_bytes*/, const size_t src_count)
    {
        // Fast path: assumes that both src and dst are 16-byte aligned and that
        // src_stride and dst_stride are multiples of 16, since aligned loads
        // (_mm_load_ps) and non-temporal stores (_mm_stream_ps) are used below.
        if (src_count == 0) {
            return;
        }
        // _mm_loadu_ps SSE1
        // _mm_load_ps SSE1
        // _mm_load_ss SSE1
        // _mm_shuffle_ps SSE1
        // _mm_mul_ps SSE1
        // _mm_add_ps SSE1
        // _mm_stream_ps SSE1

        // glm matrices are column-major: value_ptr(matrix) + 4*i is the i-th column.
        __m128 col0 = _mm_loadu_ps(glm::value_ptr(matrix) + 0);
        __m128 col1 = _mm_loadu_ps(glm::value_ptr(matrix) + 4);
        __m128 col2 = _mm_loadu_ps(glm::value_ptr(matrix) + 8);
        __m128 col3 = _mm_loadu_ps(glm::value_ptr(matrix) + 12);

        if (1 < src_count) {
            __m128 p0 = _mm_load_ps((const float*)src); src += src_stride;
            for (size_t i = 0; i + 2 < src_count; i++) {
                // Column-major: m_ij denotes matrix[i][j], i.e. column i, row j.
                // x'_k = m_00 x_k + m_10 y_k + m_20 z_k + m_30
                // y'_k = m_01 x_k + m_11 y_k + m_21 z_k + m_31
                // z'_k = m_02 x_k + m_12 y_k + m_22 z_k + m_32
                // w'_k = m_03 x_k + m_13 y_k + m_23 z_k + m_33
                __m128 t00 = _mm_mul_ps(col0, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(0, 0, 0, 0)));
                __m128 t01 = _mm_mul_ps(col1, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(1, 1, 1, 1)));
                __m128 t02 = _mm_add_ps(t00, t01);
                __m128 t03 = _mm_mul_ps(col2, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(2, 2, 2, 2)));
                // Load the next vertex early so the load overlaps the remaining arithmetic.
                p0 = _mm_load_ps((const float*)src); src += src_stride;
                __m128 t04 = _mm_add_ps(t03, col3);
                __m128 t05 = _mm_add_ps(t02, t04);
                _mm_stream_ps((float*)dst, t05); dst += dst_stride;
            }
            // Transform the last vertex that was fetched with a full 16-byte load
            // (the second-to-last vertex overall).
            __m128 t00 = _mm_mul_ps(col0, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(0, 0, 0, 0)));
            __m128 t01 = _mm_mul_ps(col1, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(1, 1, 1, 1)));
            __m128 t02 = _mm_add_ps(t00, t01);
            __m128 t03 = _mm_mul_ps(col2, _mm_shuffle_ps(p0, p0, _MM_SHUFFLE(2, 2, 2, 2)));
            __m128 t04 = _mm_add_ps(t03, col3);
            __m128 t05 = _mm_add_ps(t02, t04);
            _mm_stream_ps((float*)dst, t05); dst += dst_stride;
        }

        // For the last element, use three single-scalar loads to avoid reading
        // beyond the end of the source buffer.
        __m128 e0 = _mm_load_ss((const float*)src); src += sizeof(float);
        __m128 t0 = _mm_mul_ps(col0, _mm_shuffle_ps(e0, e0, _MM_SHUFFLE(0, 0, 0, 0)));

        __m128 e1 = _mm_load_ss((const float*)src); src += sizeof(float);
        __m128 t1 = _mm_mul_ps(col1, _mm_shuffle_ps(e1, e1, _MM_SHUFFLE(0, 0, 0, 0)));
        __m128 t2 = _mm_add_ps(t0, t1);

        __m128 e2 = _mm_load_ss((const float*)src);
        __m128 t3 = _mm_mul_ps(col2, _mm_shuffle_ps(e2, e2, _MM_SHUFFLE(0, 0, 0, 0)));
        __m128 t4 = _mm_add_ps(t2, col3);
        __m128 t5 = _mm_add_ps(t3, t4);

        _mm_stream_ps((float*)dst, t5); dst += dst_stride;
    }
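
    // Illustrative dispatch sketch (the wrapper name is hypothetical and not part of the
    // original file): the fast path uses aligned loads and non-temporal stores, so it is
    // only valid when both pointers and both strides are multiples of 16 bytes.
    void transformVertex3ToVertex4DispatchSSE1(uint8_t* dst, const size_t dst_stride, const size_t dst_bytes,
                                               const glm::mat4& matrix,
                                               const uint8_t* src, const size_t src_stride, const size_t src_bytes, const size_t src_count)
    {
        const bool aligned = ((reinterpret_cast<uintptr_t>(dst) |
                               reinterpret_cast<uintptr_t>(src) |
                               dst_stride | src_stride) & 0xF) == 0;
        if (aligned) {
            transformVertex3ToVertex4FastPathSSE1(dst, dst_stride, dst_bytes, matrix, src, src_stride, src_bytes, src_count);
        }
        else {
            transformVertex3ToVertex4SSE1(dst, dst_stride, dst_bytes, matrix, src, src_stride, src_bytes, src_count);
        }
    }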

}
#endif