// Box-filters one scanline of 4-float SSE vectors with a running (sliding-window) sum.
// Element i of the scanline is the __m128 at src + 4 * i; the result is written to
// dst + 4 * i using the aligned load/store intrinsics.
// NOTE(review): this is a listing excerpt -- the remaining parameters (src, N, radius are
// used below), several loop headers and all closing braces are not visible here.
16 void boxFilterScanlineSSE(
float* dst,
// Clamp the requested radius to the range [0, N/2 - 1].
22 auto r = std::max(0.f, std::min(
static_cast<float>(N / 2) - 1, radius));
// s: normalization factor 1 / (2r + 1), broadcast to all four lanes.
24 auto s = _mm_set_ps1(1.f / (2.f*r + 1.f));
// n: integer part of the clamped radius.
25 auto n =
static_cast<uint32_t
>(r);
// f: fractional part of the radius (weight of the outermost, partially covered tap);
// q = 1 - f is the remaining weight needed to turn a partial tap into a full one.
26 auto f = _mm_set_ps1(r - n);
27 auto q = _mm_sub_ps(_mm_set_ps1(1), f);
// Seed the running sum for output 0: elements 0..n at full weight plus f * element n + 1.
// No taps left of element 0 are added, so positions before the start contribute nothing.
29 __m128 sum = _mm_load_ps(src);
30 for (uint32_t k = 1; k <= n; k++) {
31 sum = _mm_add_ps(sum, _mm_load_ps(src + 4 * k));
33 sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (n + 1))));
// Head section (its loop header is not visible in this excerpt).
// Each step stores the normalized sum for element i, then slides the window by one:
//   + q * src[i + n + 1]   (partial right tap becomes a full tap)
//   + f * src[i + n + 2]   (new partial right tap)
//   - q * src[i - n]       (full left tap becomes a partial tap)
//   - f * src[i - n - 1]   (old partial left tap leaves the window)
// The two subtractions are guarded; `i - n < N` presumably relies on unsigned wraparound
// to reject indices below zero -- the declarations of i and N are not visible, confirm.
38 _mm_store_ps(dst + 4 * i, _mm_mul_ps(s, sum));
39 sum = _mm_add_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i + n + 1))));
40 sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i + n + 2))));
41 if (i - n < N) sum = _mm_sub_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i - n))));
42 if (i - n - 1 < N) sum = _mm_sub_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i - n - 1))));
// Middle section: same update with no bounds checks. The loop condition keeps
// i + n + 2 below N; the lower taps are assumed valid because of where the head
// section leaves i (not visible here -- TODO confirm).
44 for (; i < N - n - 2; i++) {
45 _mm_store_ps(dst + 4 * i, _mm_mul_ps(s, sum));
46 sum = _mm_add_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i + n + 1))));
47 sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i + n + 2))));
48 sum = _mm_sub_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i - n))));
49 sum = _mm_sub_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i - n - 1))));
// Tail section (loop header not visible): the additions are guarded so that nothing
// at or beyond element N is read; the subtractions are unguarded.
52 _mm_store_ps(dst + 4 * i, _mm_mul_ps(s, sum));
53 if (i + n + 1 < N) sum = _mm_add_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i + n + 1))));
54 if (i + n + 2 < N) sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i + n + 2))));
55 sum = _mm_sub_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i - n))));
56 sum = _mm_sub_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i - n - 1))));
// Single-loop form of the same update in which all four taps are guarded.
// NOTE(review): how this loop relates to the sectioned version above (e.g. a
// preprocessor switch between the two) is not visible in this excerpt.
60 for (uint32_t i = 0; i < N; i++) {
61 _mm_store_ps(dst + 4 * i, _mm_mul_ps(s, sum));
62 if (i + n + 1 < N) sum = _mm_add_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i + n + 1))));
63 if (i + n + 2 < N) sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i + n + 2))));
// i and n are both uint32_t here, so i - n wraps to a large value when i < n; the
// `< N` test then skips the tap (assuming N is an unsigned 32-bit value -- confirm).
64 if (i - n < N) sum = _mm_sub_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i - n))));
65 if (i - n - 1 < N) sum = _mm_sub_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i - n - 1))));
// Box-filter pass along x for slices k in [a, b).
// Rows are processed four at a time: 4x4 tiles are transposed into the scratch buffer
// ptrA so that scanline element i holds column i of rows j..j+3, the scanline is
// filtered `iterations` times, and the result is transposed back into dst.
// NOTE(review): listing excerpt -- closing braces and any lines between the shown
// ones are not visible here.
72void Blur::BoxFilterPassXTaskSSE::operator()
74 for (uint32_t k = a; k < b; k++) {
75 for (uint32_t j = 0; j < size.y; j += 4) {
// Gather: load a 4-wide piece of four consecutive rows and transpose it.
// Rows j+1..j+3 and columns i+1..i+3 are accessed without bounds checks, and the
// loads are the aligned variant -- presumably size.x and size.y are multiples of 4
// and the buffers are 16-byte aligned; TODO confirm against the caller.
76 for (uint32_t i = 0; i < size.x; i += 4) {
77 auto A = _mm_load_ps(src + (k*size.y + j + 0)*size.x + i);
78 auto B = _mm_load_ps(src + (k*size.y + j + 1)*size.x + i);
79 auto C = _mm_load_ps(src + (k*size.y + j + 2)*size.x + i);
80 auto D = _mm_load_ps(src + (k*size.y + j + 3)*size.x + i);
81 _MM_TRANSPOSE4_PS(A, B, C, D);
// After the transpose each register holds one column of the tile (one x position
// for the four rows); store them as scanline elements i..i+3.
82 _mm_store_ps(ptrA + 4 * (i + 0), A);
83 _mm_store_ps(ptrA + 4 * (i + 1), B);
84 _mm_store_ps(ptrA + 4 * (i + 2), C);
85 _mm_store_ps(ptrA + 4 * (i + 3), D);
// Filter from ptrA into ptrB, then swap so that ptrA always names the latest result
// (and the next iteration reads what was just written).
87 for (uint32_t l = 0; l < iterations; l++) {
88 boxFilterScanlineSSE(ptrB, ptrA, size.x, radius);
89 std::swap(ptrA, ptrB);
// Scatter: transpose the filtered scanline back into four rows of dst.
91 for (uint32_t i = 0; i < size.x; i += 4) {
92 auto A = _mm_load_ps(ptrA + 4 * (i + 0));
93 auto B = _mm_load_ps(ptrA + 4 * (i + 1));
94 auto C = _mm_load_ps(ptrA + 4 * (i + 2));
95 auto D = _mm_load_ps(ptrA + 4 * (i + 3));
96 _MM_TRANSPOSE4_PS(A, B, C, D);
97 _mm_store_ps(dst + (k*size.y + j + 0)*size.x + i, A);
98 _mm_store_ps(dst + (k*size.y + j + 1)*size.x + i, B);
99 _mm_store_ps(dst + (k*size.y + j + 2)*size.x + i, C);
100 _mm_store_ps(dst + (k*size.y + j + 3)*size.x + i, D);
// Box-filter pass along y for slices k in [a, b).
// For every group of four adjacent x positions, the column of 4-float vectors is copied
// into the scratch buffer ptrA (one element per row j), filtered `iterations` times with
// a scanline length of size.y, and copied back to the same positions in dst.
// NOTE(review): listing excerpt -- closing braces are not visible here.
106void Blur::BoxFilterPassYTaskSSE::operator()
108 for (uint32_t k = a; k < b; k++) {
109 for (uint32_t i = 0; i < size.x; i += 4) {
// Gather the column: scanline element j is the vector at (x = i..i+3, row j, slice k).
110 for (uint32_t j = 0; j < size.y; j++) {
111 _mm_store_ps(ptrA + 4 * j, _mm_load_ps(src + (k*size.y + j)*size.x + i));
// Filter from ptrA into ptrB, then swap so ptrA names the latest result.
113 for (uint32_t l = 0; l < iterations; l++) {
114 boxFilterScanlineSSE(ptrB, ptrA, size.y, radius);
115 std::swap(ptrA, ptrB);
// Scatter the filtered column back to dst.
117 for (uint32_t j = 0; j < size.y; j++) {
118 _mm_store_ps(dst + (k*size.y + j)*size.x + i, _mm_load_ps(ptrA + 4 * j));
// Box-filter pass along z for rows j in [a, b).
// For a fixed row j and a group of x positions, the values along z (k = 0..size.z-1) are
// gathered into scratch scanlines, filtered `iterations` times with a scanline length of
// size.z, and written back to dst.
// Three variants of the body are visible below (original lines 128-170, 176-195 and
// 200-210). NOTE(review): listing excerpt -- whatever selects between them (e.g.
// preprocessor conditionals), the declaration of i for the first loop and all closing
// braces are not visible here.
// NOTE(review): in the 16-wide variants the four scratch sub-scanlines are placed
// 4 * size.y floats apart, yet each one holds size.z elements and is filtered with
// N = size.z. If size.z > size.y the sub-scanlines would overlap -- confirm whether the
// stride was meant to be size.z or whether the buffers are sized so this cannot happen.
125void Blur::BoxFilterPassZTaskSSE::operator()
128 for (uint32_t j = a; j < b; j++) {
// Variant 1, main loop: 16 x positions (four vectors) per step while a full group fits.
131 for (; i + 12 < size.x; i += 16) {
// Gather: vector m (m = 0..3) of slice k goes to element k of sub-scanline m.
132 for (uint32_t k = 0; k < size.z; k++) {
133 _mm_store_ps(ptrA + 4 * (k + 0 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 0));
134 _mm_store_ps(ptrA + 4 * (k + 1 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 4));
135 _mm_store_ps(ptrA + 4 * (k + 2 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 8));
136 _mm_store_ps(ptrA + 4 * (k + 3 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 12));
// Filter the four sub-scanlines from ptrA into ptrB, then swap so ptrA names the result.
138 for (uint32_t l = 0; l < iterations; l++) {
139 boxFilterScanlineSSE(ptrB + 4 * 0 * size.y, ptrA + 4 * 0 * size.y, size.z, radius);
140 boxFilterScanlineSSE(ptrB + 4 * 1 * size.y, ptrA + 4 * 1 * size.y, size.z, radius);
141 boxFilterScanlineSSE(ptrB + 4 * 2 * size.y, ptrA + 4 * 2 * size.y, size.z, radius);
142 boxFilterScanlineSSE(ptrB + 4 * 3 * size.y, ptrA + 4 * 3 * size.y, size.z, radius);
143 std::swap(ptrA, ptrB);
// Scatter the filtered sub-scanlines back to dst.
145 for (uint32_t k = 0; k < size.z; k++) {
146 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 0, _mm_load_ps(ptrA + 4 * (k + 0 * size.y)));
147 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 4, _mm_load_ps(ptrA + 4 * (k + 1 * size.y)));
148 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 8, _mm_load_ps(ptrA + 4 * (k + 2 * size.y)));
149 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 12, _mm_load_ps(ptrA + 4 * (k + 3 * size.y)));
// Variant 1, remainder (enclosing statement not visible): vectors that would start at
// or beyond size.x are replaced by zeros on input and skipped on output.
153 for (uint32_t k = 0; k < size.z; k++) {
154 _mm_store_ps(ptrA + 4 * (k + 0 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 0));
155 _mm_store_ps(ptrA + 4 * (k + 1 * size.y), i + 4 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 4) : _mm_setzero_ps());
156 _mm_store_ps(ptrA + 4 * (k + 2 * size.y), i + 8 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 8) : _mm_setzero_ps());
157 _mm_store_ps(ptrA + 4 * (k + 3 * size.y), i + 12 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 12) : _mm_setzero_ps());
// All four sub-scanlines are filtered, including any that were zero-filled.
159 for (uint32_t l = 0; l < iterations; l++) {
160 boxFilterScanlineSSE(ptrB + 4 * 0 * size.y, ptrA + 4 * 0 * size.y, size.z, radius);
161 boxFilterScanlineSSE(ptrB + 4 * 1 * size.y, ptrA + 4 * 1 * size.y, size.z, radius);
162 boxFilterScanlineSSE(ptrB + 4 * 2 * size.y, ptrA + 4 * 2 * size.y, size.z, radius);
163 boxFilterScanlineSSE(ptrB + 4 * 3 * size.y, ptrA + 4 * 3 * size.y, size.z, radius);
164 std::swap(ptrA, ptrB);
166 for (uint32_t k = 0; k < size.z; k++) {
167 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 0, _mm_load_ps(ptrA + 4 * (k + 0 * size.y)));
168 if (i + 4 < size.x) _mm_store_ps(dst + (k*size.y + j)*size.x + i + 4, _mm_load_ps(ptrA + 4 * (k + 1 * size.y)));
169 if (i + 8 < size.x)_mm_store_ps(dst + (k*size.y + j)*size.x + i + 8, _mm_load_ps(ptrA + 4 * (k + 2 * size.y)));
170 if (i + 12 < size.x)_mm_store_ps(dst + (k*size.y + j)*size.x + i + 12, _mm_load_ps(ptrA + 4 * (k + 3 * size.y)));
// Variant 2: a single 16-wide loop in which every step uses the guarded
// (zero-fill on load, skip on store) form shown in the remainder above.
176 for (uint32_t j = a; j < b; j++) {
177 for (uint32_t i = 0; i < size.x; i += 16) {
178 for (uint32_t k = 0; k < size.z; k++) {
179 _mm_store_ps(ptrA + 4 * (k + 0 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 0));
180 _mm_store_ps(ptrA + 4 * (k + 1 * size.y), i + 4 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 4) : _mm_setzero_ps());
181 _mm_store_ps(ptrA + 4 * (k + 2 * size.y), i + 8 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 8) : _mm_setzero_ps());
182 _mm_store_ps(ptrA + 4 * (k + 3 * size.y), i + 12 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 12) : _mm_setzero_ps());
184 for (uint32_t l = 0; l < iterations; l++) {
185 boxFilterScanlineSSE(ptrB + 4 * 0 * size.y, ptrA + 4 * 0 * size.y, size.z, radius);
186 boxFilterScanlineSSE(ptrB + 4 * 1 * size.y, ptrA + 4 * 1 * size.y, size.z, radius);
187 boxFilterScanlineSSE(ptrB + 4 * 2 * size.y, ptrA + 4 * 2 * size.y, size.z, radius);
188 boxFilterScanlineSSE(ptrB + 4 * 3 * size.y, ptrA + 4 * 3 * size.y, size.z, radius);
189 std::swap(ptrA, ptrB);
191 for (uint32_t k = 0; k < size.z; k++) {
192 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 0, _mm_load_ps(ptrA + 4 * (k + 0 * size.y)));
193 if (i + 4 < size.x) _mm_store_ps(dst + (k*size.y + j)*size.x + i + 4, _mm_load_ps(ptrA + 4 * (k + 1 * size.y)));
194 if (i + 8 < size.x)_mm_store_ps(dst + (k*size.y + j)*size.x + i + 8, _mm_load_ps(ptrA + 4 * (k + 2 * size.y)));
195 if (i + 12 < size.x)_mm_store_ps(dst + (k*size.y + j)*size.x + i + 12, _mm_load_ps(ptrA + 4 * (k + 3 * size.y)));
// Variant 3: one vector (four x positions) per step, using a single scratch scanline
// of size.z elements.
200 for (uint32_t j = a; j < b; j++) {
201 for (uint32_t i = 0; i < size.x; i += 4) {
202 for (uint32_t k = 0; k < size.z; k++) {
203 _mm_store_ps(ptrA + 4 * k, _mm_load_ps(src + (k*size.y + j)*size.x + i));
205 for (uint32_t l = 0; l < iterations; l++) {
206 boxFilterScanlineSSE(ptrB, ptrA, size.z, radius);
207 std::swap(ptrA, ptrB);
209 for (uint32_t k = 0; k < size.z; k++) {
210 _mm_store_ps(dst + (k*size.y + j)*size.x + i, _mm_load_ps(ptrA + 4 * k));
Contains the Engine, Renderer, resource managers and other systems needed to run Cogs....