DIE Engine
Loading...
Searching...
No Matches
postfx.h
Go to the documentation of this file.
1
11
12#ifndef POSTFX_H
13#define POSTFX_H
14
15#include <smmintrin.h>
16
17#include <stdint.h>
18
19/*****************************************************************************/
/**
 * Blend one vector of four packed pixels (8 bits per channel).
 *
 * Per channel: curr + (((prev - curr) * weight) >> 7), saturated to 0..255.
 *
 * @param curr   four pixels of the current frame
 * @param prev   four pixels of the previous frame
 * @param weight halved blend factor (0..128) replicated in every 16-bit lane
 * @return the four blended pixels
 */
static inline __m128i motionBlurBlend4(__m128i curr, __m128i prev, __m128i weight)
{
    const __m128i zero = _mm_setzero_si128();

    // Widen every 8-bit channel to a 16-bit lane so the delta can go negative
    const __m128i curr_lo = _mm_unpacklo_epi8(curr, zero);
    const __m128i curr_hi = _mm_unpackhi_epi8(curr, zero);
    const __m128i prev_lo = _mm_unpacklo_epi8(prev, zero);
    const __m128i prev_hi = _mm_unpackhi_epi8(prev, zero);

    // (prev - curr) * weight: |delta| <= 255 and weight <= 128, so it fits in int16
    const __m128i step_lo = _mm_mullo_epi16(_mm_sub_epi16(prev_lo, curr_lo), weight);
    const __m128i step_hi = _mm_mullo_epi16(_mm_sub_epi16(prev_hi, curr_hi), weight);

    // Arithmetic shift keeps the sign of the weighted delta
    const __m128i out_lo = _mm_add_epi16(curr_lo, _mm_srai_epi16(step_lo, 7));
    const __m128i out_hi = _mm_add_epi16(curr_hi, _mm_srai_epi16(step_hi, 7));

    return _mm_packus_epi16(out_lo, out_hi);
}

/**
 * Blend the previous frame into the current one (motion blur).
 *
 * Every channel becomes curr + (prev - curr) * (blend / 2) / 128 and the
 * result is written to BOTH buffers, so `previous` carries the accumulated
 * image into the next call.
 *
 * `static inline` gives every translation unit that includes this header its
 * own definition; a plain C `inline` definition has no external definition to
 * link against when the call is not inlined.
 *
 * @param frame    current frame, `size` pixels, updated in place
 * @param previous previous frame, `size` pixels, overwritten with the result
 * @param blend    weight of the previous frame, 0 (keep current) .. 256 (take
 *                 previous); larger values are clamped to 256 because they
 *                 would overflow the signed 16-bit lanes
 * @param size     number of pixels in each buffer
 */
static inline void motionBlurSSE4(uint32_t * frame, uint32_t * previous, uint16_t blend, size_t size)
{
    if (blend > 256) {
        blend = 256;
    }

    // Halved factor: keeps delta * weight within the signed 16-bit lanes
    const __m128i weight = _mm_set1_epi16((short) (blend >> 1));

    size_t i = 0;
    for (; i + 4 <= size; i += 4) {
        const __m128i curr = _mm_loadu_si128((const __m128i *) &frame[i]);
        const __m128i prev = _mm_loadu_si128((const __m128i *) &previous[i]);
        const __m128i mixed = motionBlurBlend4(curr, prev, weight);

        _mm_storeu_si128((__m128i *) &frame[i], mixed);
        _mm_storeu_si128((__m128i *) &previous[i], mixed);
    }

    // Tail: the last size % 4 pixels go through the same kernel via padded copies
    const size_t rest = size - i;
    if (rest != 0) {
        uint32_t curr_px[4] = {0, 0, 0, 0};
        uint32_t prev_px[4] = {0, 0, 0, 0};

        for (size_t k = 0; k < rest; k++) {
            curr_px[k] = frame[i + k];
            prev_px[k] = previous[i + k];
        }

        const __m128i mixed = motionBlurBlend4(_mm_loadu_si128((const __m128i *) curr_px),
                                               _mm_loadu_si128((const __m128i *) prev_px),
                                               weight);
        _mm_storeu_si128((__m128i *) curr_px, mixed);

        for (size_t k = 0; k < rest; k++) {
            frame[i + k] = curr_px[k];
            previous[i + k] = curr_px[k];
        }
    }
}
61
62/*****************************************************************************/
/**
 * Fade one vector of four packed pixels towards a colour.
 *
 * Per channel: color + (((src - color) * keep) >> 7), saturated to 0..255.
 *
 * @param pix     four source pixels (8 bits per channel)
 * @param color16 target colour widened to 16-bit lanes (two copies of the pixel)
 * @param keep    halved source weight (0..128) replicated in every 16-bit lane
 * @return the four faded pixels
 */
static inline __m128i fadeToBlend4(__m128i pix, __m128i color16, __m128i keep)
{
    const __m128i zero = _mm_setzero_si128();

    // Widen the channels so (src - color) can be negative
    __m128i lo = _mm_unpacklo_epi8(pix, zero);
    __m128i hi = _mm_unpackhi_epi8(pix, zero);

    // (src - color) * keep, then divide by 128 with a sign-preserving shift
    lo = _mm_srai_epi16(_mm_mullo_epi16(_mm_sub_epi16(lo, color16), keep), 7);
    hi = _mm_srai_epi16(_mm_mullo_epi16(_mm_sub_epi16(hi, color16), keep), 7);

    // Add the colour back and narrow to 8 bits with unsigned saturation
    return _mm_packus_epi16(_mm_add_epi16(lo, color16), _mm_add_epi16(hi, color16));
}

/**
 * Fade the frame towards a solid colour, in place.
 *
 * Every channel (all four bytes of each pixel) becomes
 * color + (src - color) * ((256 - factor) / 2) / 128.
 *
 * `static inline` gives every translation unit that includes this header its
 * own definition; a plain C `inline` definition has no external definition to
 * link against when the call is not inlined.
 *
 * @param frame  pixel buffer, `size` pixels, updated in place
 * @param size   number of pixels in `frame`
 * @param color  target colour, same packed layout as the frame pixels
 * @param factor fade amount, 0 (unchanged) .. 256 (solid colour); larger
 *               values are clamped to 256 because they would turn the source
 *               weight negative
 */
static inline void fadeToSSE4(uint32_t * frame, size_t size, uint32_t color, uint16_t factor)
{
    if (factor > 256) {
        factor = 256;
    }

    // All four 32-bit lanes hold the same colour, so the low and high unpacks
    // are identical and one widened vector serves both halves.
    const __m128i color16 = _mm_unpacklo_epi8(_mm_set1_epi32((int) color), _mm_setzero_si128());

    // Halved factor: keeps (src - color) * keep within the signed 16-bit lanes
    const __m128i keep = _mm_set1_epi16((short) ((256 - factor) >> 1));

    size_t i = 0;
    for (; i + 4 <= size; i += 4) {
        const __m128i pix = _mm_loadu_si128((const __m128i *) &frame[i]);
        _mm_storeu_si128((__m128i *) &frame[i], fadeToBlend4(pix, color16, keep));
    }

    // Tail: the last size % 4 pixels go through the same kernel via a padded copy
    const size_t rest = size - i;
    if (rest != 0) {
        uint32_t px[4] = {0, 0, 0, 0};

        for (size_t k = 0; k < rest; k++) {
            px[k] = frame[i + k];
        }

        const __m128i faded = fadeToBlend4(_mm_loadu_si128((const __m128i *) px), color16, keep);
        _mm_storeu_si128((__m128i *) px, faded);

        for (size_t k = 0; k < rest; k++) {
            frame[i + k] = px[k];
        }
    }
}
105
106/*****************************************************************************/
/**
 * Scale 4 scattered pixels by a common factor, in place.
 *
 * Every byte of every pixel (all four channels) becomes (channel * factor) >> 8.
 * All four pixels are read before any of them is written, so the pointers may
 * refer to the same pixel; it is then scaled once.
 *
 * `static inline` gives every translation unit that includes this header its
 * own definition; a plain C `inline` definition has no external definition to
 * link against when the call is not inlined.
 *
 * @param p0     first pixel, updated in place
 * @param p1     second pixel, updated in place
 * @param p2     third pixel, updated in place
 * @param p3     fourth pixel, updated in place
 * @param factor brightness scale, 0 (black) .. 256 (unchanged); larger values
 *               are clamped to 256 because channel * factor would no longer
 *               fit the unsigned 16-bit lanes
 */
static inline void darken4SSE4(uint32_t * p0, uint32_t * p1, uint32_t * p2, uint32_t * p3, uint16_t factor)
{
    if (factor > 256) {
        factor = 256;
    }

    const __m128i zero = _mm_setzero_si128();
    const __m128i scale = _mm_set1_epi16((short) factor);

    // Gather the 4 pixels into one register (p0 in the lowest 32-bit lane)
    const __m128i pixels = _mm_set_epi32((int) *p3, (int) *p2, (int) *p1, (int) *p0);

    // Widen to 16-bit lanes, multiply, then divide by 256.
    // Logical shift: the product is unsigned (channel 0-255 * factor 0-256).
    const __m128i lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(pixels, zero), scale), 8); // pixels 0 + 1
    const __m128i hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(pixels, zero), scale), 8); // pixels 2 + 3

    // Narrow back to 8-bit channels and scatter through a small buffer
    uint32_t out[4];
    _mm_storeu_si128((__m128i *) out, _mm_packus_epi16(lo, hi));

    *p0 = out[0];
    *p1 = out[1];
    *p2 = out[2];
    *p3 = out[3];
}
136
/**
 * Apply a circular vignette (darkened corners) to the frame, in place.
 *
 * Pixels whose squared distance from the centre (width / 2, height / 2) is at
 * most inner_radius^2 are left untouched. Beyond that, brightness is scaled by
 * (outer_radius^2 - dist^2) / (outer_radius^2 - inner_radius^2), reaching 0
 * (black) at and beyond outer_radius.
 *
 * Only the top-left quadrant is evaluated; each factor is applied to the four
 * mirrored pixels with darken4SSE4(). For odd dimensions the centre column/row
 * is included in the quadrant, so the mirrored pointers coincide there; this
 * relies on darken4SSE4() reading all four pixels before writing any, so such
 * a pixel is scaled exactly once.
 *
 * `static inline` gives every translation unit that includes this header its
 * own definition; a plain C `inline` definition has no external definition to
 * link against when the call is not inlined.
 *
 * @param frame        pixel buffer of width * height pixels, rows contiguous
 * @param width        frame width in pixels
 * @param height       frame height in pixels
 * @param inner_radius radius (pixels) inside which nothing is darkened
 * @param outer_radius radius (pixels) at which pixels become black; nothing
 *                     is done unless outer_radius^2 > inner_radius^2
 */
static inline void vignetteSSE4(uint32_t * frame, size_t width, size_t height, int inner_radius, int outer_radius)
{
    const int w = (int) width;
    const int h = (int) height;
    const int cx = w >> 1;
    const int cy = h >> 1;

    // 64-bit arithmetic: squared radii and the (num * 256) scaling can exceed int
    const int64_t inner2 = (int64_t) inner_radius * inner_radius;
    const int64_t outer2 = (int64_t) outer_radius * outer_radius;
    const int64_t range2 = outer2 - inner2;
    if (range2 <= 0) {
        return;
    }

    // Round the quadrant up so the centre column/row of odd sizes is covered
    const int half_w = (w + 1) / 2;
    const int half_h = (h + 1) / 2;

    for (int y = 0; y < half_h; y++) {
        const int64_t dy = (int64_t) y - cy;
        const int64_t dy2 = dy * dy;
        const size_t row_top = (size_t) y * width;
        const size_t row_bottom = (size_t) (h - 1 - y) * width;

        for (int x = 0; x < half_w; x++) {
            const int64_t dx = (int64_t) x - cx;
            const int64_t dist2 = dx * dx + dy2;
            if (dist2 <= inner2) {
                continue;
            }

            int64_t num = outer2 - dist2;
            if (num < 0) {
                num = 0;
            }
            // dist2 > inner2 implies num < range2, so the factor is 0..255
            const uint16_t factor = (uint16_t) ((num * 256) / range2);

            // 4 symmetric pixels
            const size_t left = (size_t) x;
            const size_t right = (size_t) (w - 1 - x);
            darken4SSE4(&frame[row_top + left], &frame[row_top + right],
                        &frame[row_bottom + left], &frame[row_bottom + right], factor);
        }
    }
}
178
179/*****************************************************************************/
/**
 * Run the smoothstep tone curve on one colour channel of four pixels.
 *
 * q = clamp(c / 255 * gain, 0, 1); result = round(255 * (3q^2 - 2q^3)).
 *
 * @param chan the channel values (0..255), one per 32-bit lane
 * @param gain per-channel gain replicated in all four float lanes
 * @return the curved channel values (0..255), one per 32-bit lane
 */
static inline __m128i gammaToneChannel(__m128i chan, __m128 gain)
{
    const __m128 inv255 = _mm_set1_ps(1.0f / 255.0f);

    __m128 q = _mm_mul_ps(_mm_mul_ps(_mm_cvtepi32_ps(chan), inv255), gain);

    // Smoothstep is only monotonic on [0, 1]; outside it the cubic folds back,
    // so without the clamp a gain above 1 would turn bright pixels dark.
    q = _mm_min_ps(_mm_max_ps(q, _mm_setzero_ps()), _mm_set1_ps(1.0f));

    // 3q^2 - 2q^3 written as q^2 * (3 - 2q)
    const __m128 curved = _mm_mul_ps(_mm_mul_ps(q, q),
                                     _mm_sub_ps(_mm_set1_ps(3.0f), _mm_add_ps(q, q)));

    return _mm_cvtps_epi32(_mm_mul_ps(curved, _mm_set1_ps(255.0f)));
}

/**
 * Apply the tone curve to the three colour channels of four pixels.
 *
 * Red is taken from bits 16..23, green from bits 8..15 and blue from bits
 * 0..7 of each pixel; bits 24..31 (alpha) are copied through unchanged.
 */
static inline __m128i gammaTonePixels4(__m128i pixels, __m128 kr, __m128 kg, __m128 kb)
{
    const __m128i byte_mask = _mm_set1_epi32(0xFF);
    const __m128i alpha_mask = _mm_set1_epi32((int) 0xFF000000u);

    const __m128i r = gammaToneChannel(_mm_and_si128(_mm_srli_epi32(pixels, 16), byte_mask), kr);
    const __m128i g = gammaToneChannel(_mm_and_si128(_mm_srli_epi32(pixels, 8), byte_mask), kg);
    const __m128i b = gammaToneChannel(_mm_and_si128(pixels, byte_mask), kb);

    // Reassemble: preserve alpha, shift R/G/B back to their byte positions
    __m128i out = _mm_and_si128(pixels, alpha_mask);
    out = _mm_or_si128(out, _mm_slli_epi32(r, 16));
    out = _mm_or_si128(out, _mm_slli_epi32(g, 8));
    out = _mm_or_si128(out, b);
    return out;
}

/**
 * Apply a per-channel smoothstep tone curve to the frame, in place.
 *
 * ks = {kr, kg, kb}; for each colour channel c (0..255):
 * q = clamp(c / 255 * k, 0, 1), c = 255 * (3q^2 - 2q^3). Alpha is preserved.
 *
 * `static inline` gives every translation unit that includes this header its
 * own definition; a plain C `inline` definition has no external definition to
 * link against when the call is not inlined.
 *
 * @param frame  pixel buffer of width * height pixels, updated in place
 * @param width  frame width in pixels
 * @param height frame height in pixels
 * @param ks     gains for red, green and blue, in that order
 */
static inline void gammaSSE4(uint32_t * frame, size_t width, size_t height, float ks[3])
{
    const __m128 vkr = _mm_set1_ps(ks[0]);
    const __m128 vkg = _mm_set1_ps(ks[1]);
    const __m128 vkb = _mm_set1_ps(ks[2]);

    const size_t size = width * height;

    size_t i = 0;
    for (; i + 4 <= size; i += 4) {
        const __m128i pixels = _mm_loadu_si128((const __m128i *) &frame[i]);
        _mm_storeu_si128((__m128i *) &frame[i], gammaTonePixels4(pixels, vkr, vkg, vkb));
    }

    // Tail: the last size % 4 pixels go through the same kernel via a padded copy
    const size_t rest = size - i;
    if (rest != 0) {
        uint32_t px[4] = {0, 0, 0, 0};

        for (size_t k = 0; k < rest; k++) {
            px[k] = frame[i + k];
        }

        const __m128i toned = gammaTonePixels4(_mm_loadu_si128((const __m128i *) px), vkr, vkg, vkb);
        _mm_storeu_si128((__m128i *) px, toned);

        for (size_t k = 0; k < rest; k++) {
            frame[i + k] = px[k];
        }
    }
}
240
241#endif // POSTFX_H
void fadeToSSE4(uint32_t *frame, size_t size, uint32_t color, uint16_t factor)
Fade the frame towards a solid color.
Definition postfx.h:70
void gammaSSE4(uint32_t *frame, size_t width, size_t height, float ks[3])
Apply a per-channel smoothstep tone curve.
Definition postfx.h:185
void darken4SSE4(uint32_t *p0, uint32_t *p1, uint32_t *p2, uint32_t *p3, uint16_t factor)
Scale 4 scattered pixels by a common factor.
Definition postfx.h:111
void motionBlurSSE4(uint32_t *frame, uint32_t *previous, uint16_t blend, size_t size)
Blend the previous frame into the current one (motion blur).
Definition postfx.h:27
void vignetteSSE4(uint32_t *frame, size_t width, size_t height, int inner_radius, int outer_radius)
Apply a circular vignette (darkened corners).
Definition postfx.h:142