#ifndef __AVX__
static_assert(
    false,
    "This compile unit must be compiled with AVX");
#endif

#include <cstddef>
#include <cstdint>
#include <immintrin.h>

#include <glm/glm.hpp>
#include <glm/gtc/type_ptr.hpp>
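
// Transforms a stream of packed 3-component (xyz) positions by a glm::mat4,
// writing 4-component (xyzw) results. The plain AVX version below uses
// unaligned loads and stores; the FastPath variant further down assumes
// aligned source vertices and writes its results with non-temporal
// (streaming) stores.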

void transformVertex3ToVertex4AVX(uint8_t* dst,
                                  const size_t dst_stride,
                                  const glm::mat4& matrix,
                                  const uint8_t* src,
                                  const size_t src_stride,
                                  const size_t src_count)
{
    if (src_count == 0) {
        return;
    }

    // Load the four matrix columns and duplicate each into both 128-bit
    // halves of a 256-bit register, so two vertices can be transformed at once.
    __m128 _col0 = _mm_loadu_ps(glm::value_ptr(matrix) + 0);
    __m128 _col1 = _mm_loadu_ps(glm::value_ptr(matrix) + 4);
    __m256 col0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col0), _col0, 1);
    __m128 _col2 = _mm_loadu_ps(glm::value_ptr(matrix) + 8);
    __m256 col1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col1), _col1, 1);
    __m128 _col3 = _mm_loadu_ps(glm::value_ptr(matrix) + 12);
    __m256 col2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col2), _col2, 1);
    __m256 col3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col3), _col3, 1);
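
    // Each result is computed column-wise as
    //     out = col0 * x + col1 * y + col2 * z + col3
    // which is matrix * vec4(x, y, z, 1). Because every column is duplicated
    // across both 128-bit lanes, one 256-bit operation applies this to two
    // vertices at a time (one in the low lane, one in the high lane).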

    size_t i = 0;
    if (src_count > 2) {
        // Keep one pair of vertices in flight: `p` holds the pair currently
        // being transformed while the next pair is loaded. Counts of one or
        // two vertices are handled entirely by the tail code below.
        __m128 _p0 = _mm_loadu_ps((const float*)src); src += src_stride;
        __m128 _p1 = _mm_loadu_ps((const float*)src); src += src_stride;
        __m256 p = _mm256_insertf128_ps(_mm256_castps128_ps256(_p0), _p1, 1);
        for (; i + 4 < src_count; i += 2) {
            _p0 = _mm_loadu_ps((const float*)src); src += src_stride;
            _p1 = _mm_loadu_ps((const float*)src); src += src_stride;
            __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
            __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
            __m256 t2 = _mm256_add_ps(t0, t1);
            __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
            // Swap in the freshly loaded pair before the final adds so the
            // loads overlap with the remaining arithmetic.
            p = _mm256_insertf128_ps(_mm256_castps128_ps256(_p0), _p1, 1);

            __m256 t4 = _mm256_add_ps(t3, col3);
            __m256 t5 = _mm256_add_ps(t2, t4);

            _mm_storeu_ps((float*)dst, _mm256_castps256_ps128(t5)); dst += dst_stride;
            _mm_storeu_ps((float*)dst, _mm256_extractf128_ps(t5, 1)); dst += dst_stride;
        }

        // Transform and store the last pair loaded above.
        __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
        __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
        __m256 t2 = _mm256_add_ps(t0, t1);
        __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
        __m256 t4 = _mm256_add_ps(t3, col3);
        __m256 t5 = _mm256_add_ps(t2, t4);

        _mm_storeu_ps((float*)dst, _mm256_castps256_ps128(t5)); dst += dst_stride;
        _mm_storeu_ps((float*)dst, _mm256_extractf128_ps(t5, 1)); dst += dst_stride;
    }
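
    // Tail handling: an even count leaves two vertices, which are transformed
    // as one more pair; an odd count leaves a single vertex, which is loaded
    // component by component. The second vertex of the final pair is read
    // with a load that starts one float early and is then shuffled into
    // place, so no bytes past the end of the source buffer are touched.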
    if ((src_count & 0x1) == 0) {
        __m128 p0 = _mm_loadu_ps((const float*)src);
        __m128 p1 = _mm_loadu_ps((const float*)(src + src_stride) - 1);
        __m256 p = _mm256_insertf128_ps(_mm256_castps128_ps256(p0),
                                        _mm_shuffle_ps(p1, p1, _MM_SHUFFLE(3, 3, 2, 1)), 1);
        __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
        __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
        __m256 t2 = _mm256_add_ps(t0, t1);
        __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
        __m256 t4 = _mm256_add_ps(t3, col3);
        __m256 t5 = _mm256_add_ps(t2, t4);

        _mm_storeu_ps((float*)dst, _mm256_castps256_ps128(t5)); dst += dst_stride;
        _mm_storeu_ps((float*)dst, _mm256_extractf128_ps(t5, 1)); dst += dst_stride;
    } else {
        // Odd count: transform the final vertex using scalar loads.
        __m128 e0 = _mm_load_ss((const float*)src); src += sizeof(float);
        __m128 t0 = _mm_mul_ps(_mm256_castps256_ps128(col0), _mm_shuffle_ps(e0, e0, _MM_SHUFFLE(0, 0, 0, 0)));

        __m128 e1 = _mm_load_ss((const float*)src); src += sizeof(float);
        __m128 t1 = _mm_mul_ps(_mm256_castps256_ps128(col1), _mm_shuffle_ps(e1, e1, _MM_SHUFFLE(0, 0, 0, 0)));
        __m128 t2 = _mm_add_ps(t0, t1);

        __m128 e2 = _mm_load_ss((const float*)src); src += sizeof(float) - src_stride;
        __m128 t3 = _mm_mul_ps(_mm256_castps256_ps128(col2), _mm_shuffle_ps(e2, e2, _MM_SHUFFLE(0, 0, 0, 0)));
        __m128 t4 = _mm_add_ps(t2, _mm256_castps256_ps128(col3));
        __m128 t5 = _mm_add_ps(t3, t4);

        _mm_storeu_ps((float*)dst, t5); dst += dst_stride;
    }
}
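
// The fast-path variant below is structurally identical, but it loads the
// source vertices with aligned 16-byte loads (_mm_load_ps) and writes the
// results with non-temporal stores (_mm256_stream_ps / _mm_stream_ps). This
// presumes 16-byte-aligned source vertices and a tightly packed, 32-byte-
// aligned vec4 destination: each streamed 32-byte write holds two results,
// so dst_stride is effectively expected to be 16 bytes. Callers that read
// the output right away may additionally want an _mm_sfence() after the call.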

void transformVertex3ToVertex4FastPathAVX(uint8_t* dst,
                                          const size_t dst_stride,
                                          const glm::mat4& matrix,
                                          const uint8_t* src,
                                          const size_t src_stride,
                                          const size_t src_count)
{
    if (src_count == 0) {
        return;
    }

    __m128 _col0 = _mm_loadu_ps(glm::value_ptr(matrix) + 0);
    __m128 _col1 = _mm_loadu_ps(glm::value_ptr(matrix) + 4);
    __m256 col0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col0), _col0, 1);
    __m128 _col2 = _mm_loadu_ps(glm::value_ptr(matrix) + 8);
    __m256 col1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col1), _col1, 1);
    __m128 _col3 = _mm_loadu_ps(glm::value_ptr(matrix) + 12);
    __m256 col2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col2), _col2, 1);
    __m256 col3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_col3), _col3, 1);

    size_t i = 0;
    if (src_count > 2) {
        __m128 _p0 = _mm_load_ps((const float*)src); src += src_stride;
        __m128 _p1 = _mm_load_ps((const float*)src); src += src_stride;
        __m256 p = _mm256_insertf128_ps(_mm256_castps128_ps256(_p0), _p1, 1);
        for (; i + 4 < src_count; i += 2) {
            _p0 = _mm_load_ps((const float*)src); src += src_stride;
            _p1 = _mm_load_ps((const float*)src); src += src_stride;
            __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
            __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
            __m256 t2 = _mm256_add_ps(t0, t1);
            __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
            p = _mm256_insertf128_ps(_mm256_castps128_ps256(_p0), _p1, 1);

            __m256 t4 = _mm256_add_ps(t3, col3);
            __m256 t5 = _mm256_add_ps(t2, t4);

            // Both transformed vertices are written with a single streaming store.
            _mm256_stream_ps((float*)dst, t5); dst += 2 * dst_stride;
        }

        __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
        __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
        __m256 t2 = _mm256_add_ps(t0, t1);
        __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
        __m256 t4 = _mm256_add_ps(t3, col3);
        __m256 t5 = _mm256_add_ps(t2, t4);

        _mm256_stream_ps((float*)dst, t5); dst += 2 * dst_stride;
    }
    if ((src_count & 0x1) == 0) {
        __m128 p0 = _mm_load_ps((const float*)src);
        __m128 p1 = _mm_loadu_ps((const float*)(src + src_stride) - 1);
        __m256 p = _mm256_insertf128_ps(_mm256_castps128_ps256(p0),
                                        _mm_shuffle_ps(p1, p1, _MM_SHUFFLE(3, 3, 2, 1)), 1);
        __m256 t0 = _mm256_mul_ps(col0, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)));
        __m256 t1 = _mm256_mul_ps(col1, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)));
        __m256 t2 = _mm256_add_ps(t0, t1);
        __m256 t3 = _mm256_mul_ps(col2, _mm256_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)));
        __m256 t4 = _mm256_add_ps(t3, col3);
        __m256 t5 = _mm256_add_ps(t2, t4);

        _mm256_stream_ps((float*)dst, t5); dst += 2 * dst_stride;
    } else {
        // Odd count: transform the final vertex using scalar loads.
        __m128 e0 = _mm_load_ss((const float*)src); src += sizeof(float);
        __m128 t0 = _mm_mul_ps(_mm256_castps256_ps128(col0), _mm_shuffle_ps(e0, e0, _MM_SHUFFLE(0, 0, 0, 0)));

        __m128 e1 = _mm_load_ss((const float*)src); src += sizeof(float);
        __m128 t1 = _mm_mul_ps(_mm256_castps256_ps128(col1), _mm_shuffle_ps(e1, e1, _MM_SHUFFLE(0, 0, 0, 0)));
        __m128 t2 = _mm_add_ps(t0, t1);

        __m128 e2 = _mm_load_ss((const float*)src); src += sizeof(float) - src_stride;
        __m128 t3 = _mm_mul_ps(_mm256_castps256_ps128(col2), _mm_shuffle_ps(e2, e2, _MM_SHUFFLE(0, 0, 0, 0)));
        __m128 t4 = _mm_add_ps(t2, _mm256_castps256_ps128(col3));
        __m128 t5 = _mm_add_ps(t3, t4);

        _mm_stream_ps((float*)dst, t5); dst += dst_stride;
    }
}
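
// Illustrative usage sketch. The buffer setup and the names used here
// (positions, results, modelMatrix) are assumptions made for the example,
// not part of this file:
//
//   std::vector<glm::vec3> positions = /* packed xyz input */;
//   std::vector<glm::vec4> results(positions.size());
//   transformVertex3ToVertex4AVX(reinterpret_cast<uint8_t*>(results.data()),
//                                sizeof(glm::vec4),
//                                modelMatrix,
//                                reinterpret_cast<const uint8_t*>(positions.data()),
//                                sizeof(glm::vec3),
//                                positions.size());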