TransformVertices_avx.cpp
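
// AVX kernels for transforming packed 3-component vertex positions by a 4x4
// matrix into 4-component output vertices. Two vertices are processed per
// iteration by packing one position into each 128-bit lane of a 256-bit
// register.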
#ifndef EMSCRIPTEN

#ifndef __AVX__
static_assert(false, "This compile unit must be compiled with AVX");
#endif

#include <glm/glm.hpp>
#include <glm/gtc/type_ptr.hpp>
#include <immintrin.h>

namespace Cogs::Core
{

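    // Transforms src_count 3-float positions, src_stride bytes apart starting
    // at src, by matrix, writing the 4-float results dst_stride bytes apart
    // starting at dst. Only unaligned loads and stores are used, so buffers
    // need no particular alignment.
    //
    // Illustrative call (hypothetical caller, not from this file):
    //
    //   std::vector<glm::vec3> positions = ...;
    //   std::vector<glm::vec4> transformed(positions.size());
    //   transformVertex3ToVertex4AVX(reinterpret_cast<uint8_t*>(transformed.data()),
    //                                sizeof(glm::vec4),
    //                                transformed.size() * sizeof(glm::vec4),
    //                                matrix,
    //                                reinterpret_cast<const uint8_t*>(positions.data()),
    //                                sizeof(glm::vec3),
    //                                positions.size() * sizeof(glm::vec3),
    //                                positions.size());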
    void transformVertex3ToVertex4AVX(uint8_t* dst,
                                      const size_t dst_stride,
                                      const size_t /*dst_bytes*/,
                                      const glm::mat4& matrix,
                                      const uint8_t* src,
                                      const size_t src_stride,
                                      const size_t /*src_bytes*/,
                                      const size_t src_count)
    {
        if (src_count == 0) {
            return;
        }
        // Broadcast each matrix column into both 128-bit lanes of a 256-bit
        // register, so a single register can transform two vertices at once.
        __m128 _col0 = _mm_loadu_ps(glm::value_ptr(matrix) + 0);
        __m128 _col1 = _mm_loadu_ps(glm::value_ptr(matrix) + 4);
        __m256 col0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col0), _col0, 1);
        __m128 _col2 = _mm_loadu_ps(glm::value_ptr(matrix) + 8);
        __m256 col1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col1), _col1, 1);
        __m128 _col3 = _mm_loadu_ps(glm::value_ptr(matrix) + 12);
        __m256 col2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col2), _col2, 1);
        __m256 col3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col3), _col3, 1);

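        // Each output vertex is matrix * (x, y, z, 1):
        //
        //   out = col0 * x + col1 * y + col2 * z + col3
        //
        // Only the x, y and z lanes of a loaded position are ever broadcast,
        // so the extra float each 16-byte load picks up past a 12-byte
        // position never influences the result.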
        size_t i = 0;
        if (2 < src_count) {
            // Software-pipelined main loop: while the previously loaded pair of
            // positions in p is transformed, the next pair is fetched. Each
            // 16-byte load reads one float past a 12-byte packed position, so
            // the loop stops early and leaves the last vertices to the tail
            // handling below, which never reads past the end of src.
            __m128 _p0 = _mm_loadu_ps((float*)src); src += src_stride;
            __m128 _p1 = _mm_loadu_ps((float*)src); src += src_stride;
            __m256 p = _mm256_insertf128_ps(_mm256_castps128_ps256(_p0), _p1, 1);
            for (; i + 4 < src_count; i += 2) {
                _p0 = _mm_loadu_ps((float*)src); src += src_stride;
                _p1 = _mm_loadu_ps((float*)src); src += src_stride;
                __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
                __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
                __m256 t2 = _mm256_add_ps(t0, t1);
                __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
                p = _mm256_insertf128_ps(_mm256_castps128_ps256(_p0), _p1, 1);

                __m256 t4 = _mm256_add_ps(t3, col3);
                __m256 t5 = _mm256_add_ps(t2, t4);

                _mm_storeu_ps((float*)dst, _mm256_castps256_ps128(t5)); dst += dst_stride;
                _mm_storeu_ps((float*)dst, _mm256_extractf128_ps(t5, 1)); dst += dst_stride;
            }

            // Drain the pipeline: transform the last pair loaded above.
            __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
            __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
            __m256 t2 = _mm256_add_ps(t0, t1);
            __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
            __m256 t4 = _mm256_add_ps(t3, col3);
            __m256 t5 = _mm256_add_ps(t2, t4);

            _mm_storeu_ps((float*)dst, _mm256_castps256_ps128(t5)); dst += dst_stride;
            _mm_storeu_ps((float*)dst, _mm256_extractf128_ps(t5, 1)); dst += dst_stride;
            i += 2;
        }

        if ((src_count & 0x1) == 0) { // even count: exactly two vertices left
            __m128 p0 = _mm_loadu_ps((float*)src);
            // Load the last position backed off by one float and shuffle its
            // x, y, z into place, so the 16-byte read cannot extend past the
            // end of the source buffer.
            __m128 p1 = _mm_loadu_ps((float*)(src + src_stride) - 1);
            __m256 p = _mm256_insertf128_ps(_mm256_castps128_ps256(p0),
                                            _mm_shuffle_ps(p1, p1, _MM_SHUFFLE(3, 3, 2, 1)), 1);
            __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
            __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
            __m256 t2 = _mm256_add_ps(t0, t1);
            __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
            __m256 t4 = _mm256_add_ps(t3, col3);
            __m256 t5 = _mm256_add_ps(t2, t4);

            _mm_storeu_ps((float*)dst, _mm256_castps256_ps128(t5)); dst += dst_stride;
            _mm_storeu_ps((float*)dst, _mm256_extractf128_ps(t5, 1)); dst += dst_stride;
        }
        else { // odd count: one vertex left, loaded one component at a time
            __m128 e0 = _mm_load_ss((float*)src); src += sizeof(float);
            __m128 t0 = _mm_mul_ps(_mm256_castps256_ps128(col0), _mm_shuffle_ps(e0, e0, _MM_SHUFFLE(0, 0, 0, 0)));

            __m128 e1 = _mm_load_ss((float*)src); src += sizeof(float);
            __m128 t1 = _mm_mul_ps(_mm256_castps256_ps128(col1), _mm_shuffle_ps(e1, e1, _MM_SHUFFLE(0, 0, 0, 0)));
            __m128 t2 = _mm_add_ps(t0, t1);

            __m128 e2 = _mm_load_ss((float*)src); src += sizeof(float) - src_stride;
            __m128 t3 = _mm_mul_ps(_mm256_castps256_ps128(col2), _mm_shuffle_ps(e2, e2, _MM_SHUFFLE(0, 0, 0, 0)));
            __m128 t4 = _mm_add_ps(t2, _mm256_castps256_ps128(col3));
            __m128 t5 = _mm_add_ps(t3, t4);

            _mm_storeu_ps((float*)dst, t5); dst += dst_stride;
        }

        _mm256_zeroupper(); // Avoid AVX/SSE transition penalties in the caller.
    }

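    // Fast-path variant of transformVertex3ToVertex4AVX. The aligned loads
    // require src and src_stride to keep every position 16-byte aligned, and
    // the 32-byte non-temporal stores require a 32-byte aligned dst with
    // dst_stride == 16, since each stream writes two consecutive vec4 outputs.
    // Streaming stores bypass the cache, which helps when the destination is
    // large and not read back immediately.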
    void transformVertex3ToVertex4FastPathAVX(uint8_t* dst,
                                              const size_t dst_stride,
                                              const size_t /*dst_bytes*/,
                                              const glm::mat4& matrix,
                                              const uint8_t* src,
                                              const size_t src_stride,
                                              const size_t /*src_bytes*/,
                                              const size_t src_count)
    {
        if (src_count == 0) {
            return;
        }
        // Broadcast each matrix column into both 128-bit lanes, as above.
        __m128 _col0 = _mm_loadu_ps(glm::value_ptr(matrix) + 0);
        __m128 _col1 = _mm_loadu_ps(glm::value_ptr(matrix) + 4);
        __m256 col0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col0), _col0, 1);
        __m128 _col2 = _mm_loadu_ps(glm::value_ptr(matrix) + 8);
        __m256 col1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col1), _col1, 1);
        __m128 _col3 = _mm_loadu_ps(glm::value_ptr(matrix) + 12);
        __m256 col2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col2), _col2, 1);
        __m256 col3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col3), _col3, 1);

        size_t i = 0;
        if (2 < src_count) {
            // Same software-pipelined loop as above, but with aligned source
            // loads and one 32-byte streaming store per pair, writing two
            // consecutive vec4 outputs while bypassing the cache.
            __m128 _p0 = _mm_load_ps((float*)src); src += src_stride;
            __m128 _p1 = _mm_load_ps((float*)src); src += src_stride;
            __m256 p = _mm256_insertf128_ps(_mm256_castps128_ps256(_p0), _p1, 1);
            for (; i + 4 < src_count; i += 2) {
                _p0 = _mm_load_ps((float*)src); src += src_stride;
                _p1 = _mm_load_ps((float*)src); src += src_stride;
                __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
                __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
                __m256 t2 = _mm256_add_ps(t0, t1);
                __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
                p = _mm256_insertf128_ps(_mm256_castps128_ps256(_p0), _p1, 1);

                __m256 t4 = _mm256_add_ps(t3, col3);
                __m256 t5 = _mm256_add_ps(t2, t4);

                _mm256_stream_ps((float*)dst, t5); dst += 2 * dst_stride;
            }

            // Drain the pipeline: transform the last pair loaded above.
            __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
            __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
            __m256 t2 = _mm256_add_ps(t0, t1);
            __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
            __m256 t4 = _mm256_add_ps(t3, col3);
            __m256 t5 = _mm256_add_ps(t2, t4);

            _mm256_stream_ps((float*)dst, t5); dst += 2 * dst_stride;
            i += 2;
        }

        if ((src_count & 0x1) == 0) { // even count: exactly two vertices left
            __m128 p0 = _mm_load_ps((float*)src);
            // Load the last position backed off by one float so the read stays
            // within the source buffer, then shuffle x, y, z into place.
            __m128 p1 = _mm_loadu_ps((float*)(src + src_stride) - 1);
            __m256 p = _mm256_insertf128_ps(_mm256_castps128_ps256(p0),
                                            _mm_shuffle_ps(p1, p1, _MM_SHUFFLE(3, 3, 2, 1)), 1);
            __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
            __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
            __m256 t2 = _mm256_add_ps(t0, t1);
            __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
            __m256 t4 = _mm256_add_ps(t3, col3);
            __m256 t5 = _mm256_add_ps(t2, t4);

            _mm256_stream_ps((float*)dst, t5); dst += 2 * dst_stride;
        }
        else { // odd count: one vertex left, loaded one component at a time
            __m128 e0 = _mm_load_ss((float*)src); src += sizeof(float);
            __m128 t0 = _mm_mul_ps(_mm256_castps256_ps128(col0), _mm_shuffle_ps(e0, e0, _MM_SHUFFLE(0, 0, 0, 0)));

            __m128 e1 = _mm_load_ss((float*)src); src += sizeof(float);
            __m128 t1 = _mm_mul_ps(_mm256_castps256_ps128(col1), _mm_shuffle_ps(e1, e1, _MM_SHUFFLE(0, 0, 0, 0)));
            __m128 t2 = _mm_add_ps(t0, t1);

            __m128 e2 = _mm_load_ss((float*)src); src += sizeof(float) - src_stride;
            __m128 t3 = _mm_mul_ps(_mm256_castps256_ps128(col2), _mm_shuffle_ps(e2, e2, _MM_SHUFFLE(0, 0, 0, 0)));
            __m128 t4 = _mm_add_ps(t2, _mm256_castps256_ps128(col3));
            __m128 t5 = _mm_add_ps(t3, t4);

            _mm_stream_ps((float*)dst, t5); dst += dst_stride;
        }

        _mm256_zeroupper(); // Avoid AVX/SSE transition penalties in the caller.
    }

}

#endif