Cogs.Core
Blur_SSE.cpp
1#ifndef EMSCRIPTEN
2
3#ifdef __linux__
4 #include <x86intrin.h>
5#else
6 #include <intrin.h>
7#endif
8#include <algorithm>
9#include "Blur.h"
10#include "Context.h"
11
12using namespace Cogs::Core;
13
namespace {

    /**
     * Box-filters four interleaved scanlines at once using SSE.
     *
     * The scanline is stored as N consecutive SSE vectors (4 floats each), so
     * element k of the scanline lives at src + 4 * k; each of the four lanes is
     * filtered independently. Both pointers are accessed with aligned
     * loads/stores and must therefore be 16-byte aligned. dst and src must not
     * overlap, since outputs are written while older inputs are still read.
     *
     * The filter has a fractional radius r: the n = floor(r) neighbours on each
     * side get weight 1 and the next neighbour on each side gets weight
     * f = r - n. Taps that fall outside [0, N) contribute nothing, and every
     * output is scaled by 1 / (2r + 1) regardless of how many taps were
     * inside the scanline.
     *
     * \param dst    Output, N vectors (4 * N floats).
     * \param src    Input, N vectors (4 * N floats).
     * \param N      Number of vectors in the scanline.
     * \param radius Requested radius; clamped to [0, N / 2 - 1].
     */
    void boxFilterScanlineSSE(float* dst,
                              const float* src,
                              const uint32_t N,
                              const float radius)
    {
        // Scanlines shorter than two elements cannot be blurred: the radius
        // clamps to zero and the filter degenerates to a copy. Handling them
        // here also keeps the index arithmetic below free of unsigned
        // wrap-around.
        if (N < 2) {
            if (N == 1) {
                _mm_store_ps(dst, _mm_load_ps(src));
            }
            return;
        }

        // Clamp the radius so that 2 * n + 2 <= N holds below.
        const float r = std::max(0.f, std::min(static_cast<float>(N / 2) - 1, radius));

        const uint32_t n = static_cast<uint32_t>(r);                // whole taps per side
        const __m128 s = _mm_set_ps1(1.f / (2.f*r + 1.f));          // normalisation
        const __m128 f = _mm_set_ps1(r - static_cast<float>(n));    // weight of the fractional tap
        const __m128 q = _mm_sub_ps(_mm_set_ps1(1.f), f);           // 1 - f

        // Window centred on element 0: elements 0..n fully, element n + 1
        // fractionally (n + 1 <= N / 2 < N, so this is in range).
        __m128 sum = _mm_load_ps(src);
        for (uint32_t k = 1; k <= n; k++) {
            sum = _mm_add_ps(sum, _mm_load_ps(src + 4 * k));
        }
        sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (n + 1))));

        // Moving the window from i to i + 1 changes the sum by
        //   + q * src[i + n + 1] + f * src[i + n + 2]
        //   - q * src[i - n]     - f * src[i - n - 1]
        // with taps outside the scanline skipped.

        // Phase 1a: the left edge of the window is still outside the scanline,
        // so nothing leaves it. Entering taps are at most 2n + 1 < N.
        uint32_t i = 0;
        for (; i < n; i++) {
            _mm_store_ps(dst + 4 * i, _mm_mul_ps(s, sum));
            sum = _mm_add_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i + n + 1))));
            sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i + n + 2))));
        }

        // Phase 1b (i == n): element 0 starts to leave the window. The
        // fractional entering tap is element 2n + 2, which equals N when the
        // radius was clamped on an even-length scanline, so it must be guarded.
        _mm_store_ps(dst + 4 * i, _mm_mul_ps(s, sum));
        sum = _mm_add_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i + n + 1))));
        if (i + n + 2 < N) {
            sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i + n + 2))));
        }
        sum = _mm_sub_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i - n))));
        i++;

        // Phase 2: interior, all four taps are inside the scanline.
        for (; i + n + 2 < N; i++) {
            _mm_store_ps(dst + 4 * i, _mm_mul_ps(s, sum));
            sum = _mm_add_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i + n + 1))));
            sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i + n + 2))));
            sum = _mm_sub_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i - n))));
            sum = _mm_sub_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i - n - 1))));
        }

        // Phase 3: the right edge of the window runs off the scanline, so the
        // entering taps need guards. The leaving taps are always valid here
        // because i >= n + 1.
        for (; i < N; i++) {
            _mm_store_ps(dst + 4 * i, _mm_mul_ps(s, sum));
            if (i + n + 1 < N) {
                sum = _mm_add_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i + n + 1))));
            }
            if (i + n + 2 < N) {
                sum = _mm_add_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i + n + 2))));
            }
            sum = _mm_sub_ps(sum, _mm_mul_ps(q, _mm_load_ps(src + 4 * (i - n))));
            sum = _mm_sub_ps(sum, _mm_mul_ps(f, _mm_load_ps(src + 4 * (i - n - 1))));
        }
    }

}
71
72void Blur::BoxFilterPassXTaskSSE::operator()()
73{
74 for (uint32_t k = a; k < b; k++) {
75 for (uint32_t j = 0; j < size.y; j += 4) {
76 for (uint32_t i = 0; i < size.x; i += 4) {
77 auto A = _mm_load_ps(src + (k*size.y + j + 0)*size.x + i);
78 auto B = _mm_load_ps(src + (k*size.y + j + 1)*size.x + i);
79 auto C = _mm_load_ps(src + (k*size.y + j + 2)*size.x + i);
80 auto D = _mm_load_ps(src + (k*size.y + j + 3)*size.x + i);
81 _MM_TRANSPOSE4_PS(A, B, C, D);
82 _mm_store_ps(ptrA + 4 * (i + 0), A);
83 _mm_store_ps(ptrA + 4 * (i + 1), B);
84 _mm_store_ps(ptrA + 4 * (i + 2), C);
85 _mm_store_ps(ptrA + 4 * (i + 3), D);
86 }
87 for (uint32_t l = 0; l < iterations; l++) {
88 boxFilterScanlineSSE(ptrB, ptrA, size.x, radius);
89 std::swap(ptrA, ptrB);
90 }
91 for (uint32_t i = 0; i < size.x; i += 4) {
92 auto A = _mm_load_ps(ptrA + 4 * (i + 0));
93 auto B = _mm_load_ps(ptrA + 4 * (i + 1));
94 auto C = _mm_load_ps(ptrA + 4 * (i + 2));
95 auto D = _mm_load_ps(ptrA + 4 * (i + 3));
96 _MM_TRANSPOSE4_PS(A, B, C, D);
97 _mm_store_ps(dst + (k*size.y + j + 0)*size.x + i, A);
98 _mm_store_ps(dst + (k*size.y + j + 1)*size.x + i, B);
99 _mm_store_ps(dst + (k*size.y + j + 2)*size.x + i, C);
100 _mm_store_ps(dst + (k*size.y + j + 3)*size.x + i, D);
101 }
102 }
103 }
104}
105
106void Blur::BoxFilterPassYTaskSSE::operator()()
107{
108 for (uint32_t k = a; k < b; k++) {
109 for (uint32_t i = 0; i < size.x; i += 4) {
110 for (uint32_t j = 0; j < size.y; j++) {
111 _mm_store_ps(ptrA + 4 * j, _mm_load_ps(src + (k*size.y + j)*size.x + i));
112 }
113 for (uint32_t l = 0; l < iterations; l++) {
114 boxFilterScanlineSSE(ptrB, ptrA, size.y, radius);
115 std::swap(ptrA, ptrB);
116 }
117 for (uint32_t j = 0; j < size.y; j++) {
118 _mm_store_ps(dst + (k*size.y + j)*size.x + i, _mm_load_ps(ptrA + 4 * j));
119 }
120 }
121 }
122}
123
124
125void Blur::BoxFilterPassZTaskSSE::operator()()
126{
127#if 1
128 for (uint32_t j = a; j < b; j++) {
129
130 uint32_t i = 0;
131 for (; i + 12 < size.x; i += 16) {
132 for (uint32_t k = 0; k < size.z; k++) {
133 _mm_store_ps(ptrA + 4 * (k + 0 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 0));
134 _mm_store_ps(ptrA + 4 * (k + 1 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 4));
135 _mm_store_ps(ptrA + 4 * (k + 2 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 8));
136 _mm_store_ps(ptrA + 4 * (k + 3 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 12));
137 }
138 for (uint32_t l = 0; l < iterations; l++) {
139 boxFilterScanlineSSE(ptrB + 4 * 0 * size.y, ptrA + 4 * 0 * size.y, size.z, radius);
140 boxFilterScanlineSSE(ptrB + 4 * 1 * size.y, ptrA + 4 * 1 * size.y, size.z, radius);
141 boxFilterScanlineSSE(ptrB + 4 * 2 * size.y, ptrA + 4 * 2 * size.y, size.z, radius);
142 boxFilterScanlineSSE(ptrB + 4 * 3 * size.y, ptrA + 4 * 3 * size.y, size.z, radius);
143 std::swap(ptrA, ptrB);
144 }
145 for (uint32_t k = 0; k < size.z; k++) {
146 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 0, _mm_load_ps(ptrA + 4 * (k + 0 * size.y)));
147 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 4, _mm_load_ps(ptrA + 4 * (k + 1 * size.y)));
148 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 8, _mm_load_ps(ptrA + 4 * (k + 2 * size.y)));
149 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 12, _mm_load_ps(ptrA + 4 * (k + 3 * size.y)));
150 }
151 }
152 if (i < size.x) {
153 for (uint32_t k = 0; k < size.z; k++) {
154 _mm_store_ps(ptrA + 4 * (k + 0 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 0));
155 _mm_store_ps(ptrA + 4 * (k + 1 * size.y), i + 4 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 4) : _mm_setzero_ps());
156 _mm_store_ps(ptrA + 4 * (k + 2 * size.y), i + 8 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 8) : _mm_setzero_ps());
157 _mm_store_ps(ptrA + 4 * (k + 3 * size.y), i + 12 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 12) : _mm_setzero_ps());
158 }
159 for (uint32_t l = 0; l < iterations; l++) {
160 boxFilterScanlineSSE(ptrB + 4 * 0 * size.y, ptrA + 4 * 0 * size.y, size.z, radius);
161 boxFilterScanlineSSE(ptrB + 4 * 1 * size.y, ptrA + 4 * 1 * size.y, size.z, radius);
162 boxFilterScanlineSSE(ptrB + 4 * 2 * size.y, ptrA + 4 * 2 * size.y, size.z, radius);
163 boxFilterScanlineSSE(ptrB + 4 * 3 * size.y, ptrA + 4 * 3 * size.y, size.z, radius);
164 std::swap(ptrA, ptrB);
165 }
166 for (uint32_t k = 0; k < size.z; k++) {
167 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 0, _mm_load_ps(ptrA + 4 * (k + 0 * size.y)));
168 if (i + 4 < size.x) _mm_store_ps(dst + (k*size.y + j)*size.x + i + 4, _mm_load_ps(ptrA + 4 * (k + 1 * size.y)));
169 if (i + 8 < size.x)_mm_store_ps(dst + (k*size.y + j)*size.x + i + 8, _mm_load_ps(ptrA + 4 * (k + 2 * size.y)));
170 if (i + 12 < size.x)_mm_store_ps(dst + (k*size.y + j)*size.x + i + 12, _mm_load_ps(ptrA + 4 * (k + 3 * size.y)));
171 }
172 }
173 }
174
175#elif 1
176 for (uint32_t j = a; j < b; j++) {
177 for (uint32_t i = 0; i < size.x; i += 16) {
178 for (uint32_t k = 0; k < size.z; k++) {
179 _mm_store_ps(ptrA + 4 * (k + 0 * size.y), _mm_load_ps(src + (k*size.y + j)*size.x + i + 0));
180 _mm_store_ps(ptrA + 4 * (k + 1 * size.y), i + 4 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 4) : _mm_setzero_ps());
181 _mm_store_ps(ptrA + 4 * (k + 2 * size.y), i + 8 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 8) : _mm_setzero_ps());
182 _mm_store_ps(ptrA + 4 * (k + 3 * size.y), i + 12 < size.x ? _mm_load_ps(src + (k*size.y + j)*size.x + i + 12) : _mm_setzero_ps());
183 }
184 for (uint32_t l = 0; l < iterations; l++) {
185 boxFilterScanlineSSE(ptrB + 4 * 0 * size.y, ptrA + 4 * 0 * size.y, size.z, radius);
186 boxFilterScanlineSSE(ptrB + 4 * 1 * size.y, ptrA + 4 * 1 * size.y, size.z, radius);
187 boxFilterScanlineSSE(ptrB + 4 * 2 * size.y, ptrA + 4 * 2 * size.y, size.z, radius);
188 boxFilterScanlineSSE(ptrB + 4 * 3 * size.y, ptrA + 4 * 3 * size.y, size.z, radius);
189 std::swap(ptrA, ptrB);
190 }
191 for (uint32_t k = 0; k < size.z; k++) {
192 _mm_store_ps(dst + (k*size.y + j)*size.x + i + 0, _mm_load_ps(ptrA + 4 * (k + 0 * size.y)));
193 if (i + 4 < size.x) _mm_store_ps(dst + (k*size.y + j)*size.x + i + 4, _mm_load_ps(ptrA + 4 * (k + 1 * size.y)));
194 if (i + 8 < size.x)_mm_store_ps(dst + (k*size.y + j)*size.x + i + 8, _mm_load_ps(ptrA + 4 * (k + 2 * size.y)));
195 if (i + 12 < size.x)_mm_store_ps(dst + (k*size.y + j)*size.x + i + 12, _mm_load_ps(ptrA + 4 * (k + 3 * size.y)));
196 }
197 }
198 }
199#else
200 for (uint32_t j = a; j < b; j++) {
201 for (uint32_t i = 0; i < size.x; i += 4) {
202 for (uint32_t k = 0; k < size.z; k++) {
203 _mm_store_ps(ptrA + 4 * k, _mm_load_ps(src + (k*size.y + j)*size.x + i));
204 }
205 for (uint32_t l = 0; l < iterations; l++) {
206 boxFilterScanlineSSE(ptrB, ptrA, size.z, radius);
207 std::swap(ptrA, ptrB);
208 }
209 for (uint32_t k = 0; k < size.z; k++) {
210 _mm_store_ps(dst + (k*size.y + j)*size.x + i, _mm_load_ps(ptrA + 4 * k));
211 }
212 }
213 }
214#endif
215}
216
217#endif
Contains the Engine, Renderer, resource managers and other systems needed to run Cogs....