Skip to content

Commit 25ae464

Browse files
committed
Make SSE4 optional
1 parent 890865c commit 25ae464

File tree

1 file changed

+134
-30
lines changed

1 file changed

+134
-30
lines changed

src/libImaging/BoxBlur.c

Lines changed: 134 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,11 @@
33
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
44
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
55

6-
void static inline ImagingLineBoxBlur32(
6+
typedef UINT8 pixel[4];
7+
8+
#if defined(__SSE4__)
9+
10+
void static inline ImagingLineBoxBlur32_SSE4(
711
UINT32 *lineOut,
812
UINT32 *lineIn,
913
int lastx,
@@ -115,7 +119,7 @@ void static inline ImagingLineBoxBlur32(
115119
#undef SAVE
116120
}
117121

118-
void static inline ImagingLineBoxBlurZero32(
122+
void static inline ImagingLineBoxBlurZero32_SSE4(
119123
UINT32 *lineOut,
120124
UINT32 *lineIn,
121125
int lastx,
@@ -177,6 +181,105 @@ void static inline ImagingLineBoxBlurZero32(
177181
#undef SAVE
178182
}
179183

184+
#endif // defined(__SSE4__)
185+
186+
void static inline ImagingLineBoxBlur32(
187+
pixel *lineOut,
188+
pixel *lineIn,
189+
int lastx,
190+
int radius,
191+
int edgeA,
192+
int edgeB,
193+
UINT32 ww,
194+
UINT32 fw
195+
) {
196+
int x;
197+
UINT32 acc[4];
198+
UINT32 bulk[4];
199+
200+
#define MOVE_ACC(acc, subtract, add) \
201+
acc[0] += lineIn[add][0] - lineIn[subtract][0]; \
202+
acc[1] += lineIn[add][1] - lineIn[subtract][1]; \
203+
acc[2] += lineIn[add][2] - lineIn[subtract][2]; \
204+
acc[3] += lineIn[add][3] - lineIn[subtract][3];
205+
206+
#define ADD_FAR(bulk, acc, left, right) \
207+
bulk[0] = (acc[0] * ww) + (lineIn[left][0] + lineIn[right][0]) * fw; \
208+
bulk[1] = (acc[1] * ww) + (lineIn[left][1] + lineIn[right][1]) * fw; \
209+
bulk[2] = (acc[2] * ww) + (lineIn[left][2] + lineIn[right][2]) * fw; \
210+
bulk[3] = (acc[3] * ww) + (lineIn[left][3] + lineIn[right][3]) * fw;
211+
212+
#define SAVE(x, bulk) \
213+
lineOut[x][0] = (UINT8)((bulk[0] + (1 << 23)) >> 24); \
214+
lineOut[x][1] = (UINT8)((bulk[1] + (1 << 23)) >> 24); \
215+
lineOut[x][2] = (UINT8)((bulk[2] + (1 << 23)) >> 24); \
216+
lineOut[x][3] = (UINT8)((bulk[3] + (1 << 23)) >> 24);
217+
218+
/* Compute acc for -1 pixel (outside of image):
219+
From "-radius-1" to "-1" get first pixel,
220+
then from "0" to "radius-1". */
221+
acc[0] = lineIn[0][0] * (radius + 1);
222+
acc[1] = lineIn[0][1] * (radius + 1);
223+
acc[2] = lineIn[0][2] * (radius + 1);
224+
acc[3] = lineIn[0][3] * (radius + 1);
225+
/* As radius can be bigger than xsize, iterate to edgeA -1. */
226+
for (x = 0; x < edgeA - 1; x++) {
227+
acc[0] += lineIn[x][0];
228+
acc[1] += lineIn[x][1];
229+
acc[2] += lineIn[x][2];
230+
acc[3] += lineIn[x][3];
231+
}
232+
/* Then multiply remainder to last x. */
233+
acc[0] += lineIn[lastx][0] * (radius - edgeA + 1);
234+
acc[1] += lineIn[lastx][1] * (radius - edgeA + 1);
235+
acc[2] += lineIn[lastx][2] * (radius - edgeA + 1);
236+
acc[3] += lineIn[lastx][3] * (radius - edgeA + 1);
237+
238+
if (edgeA <= edgeB) {
239+
/* Subtract pixel from left ("0").
240+
Add pixels from radius. */
241+
for (x = 0; x < edgeA; x++) {
242+
MOVE_ACC(acc, 0, x + radius);
243+
ADD_FAR(bulk, acc, 0, x + radius + 1);
244+
SAVE(x, bulk);
245+
}
246+
/* Subtract previous pixel from "-radius".
247+
Add pixels from radius. */
248+
for (x = edgeA; x < edgeB; x++) {
249+
MOVE_ACC(acc, x - radius - 1, x + radius);
250+
ADD_FAR(bulk, acc, x - radius - 1, x + radius + 1);
251+
SAVE(x, bulk);
252+
}
253+
/* Subtract previous pixel from "-radius".
254+
Add last pixel. */
255+
for (x = edgeB; x <= lastx; x++) {
256+
MOVE_ACC(acc, x - radius - 1, lastx);
257+
ADD_FAR(bulk, acc, x - radius - 1, lastx);
258+
SAVE(x, bulk);
259+
}
260+
} else {
261+
for (x = 0; x < edgeB; x++) {
262+
MOVE_ACC(acc, 0, x + radius);
263+
ADD_FAR(bulk, acc, 0, x + radius + 1);
264+
SAVE(x, bulk);
265+
}
266+
for (x = edgeB; x < edgeA; x++) {
267+
MOVE_ACC(acc, 0, lastx);
268+
ADD_FAR(bulk, acc, 0, lastx);
269+
SAVE(x, bulk);
270+
}
271+
for (x = edgeA; x <= lastx; x++) {
272+
MOVE_ACC(acc, x - radius - 1, lastx);
273+
ADD_FAR(bulk, acc, x - radius - 1, lastx);
274+
SAVE(x, bulk);
275+
}
276+
}
277+
278+
#undef MOVE_ACC
279+
#undef ADD_FAR
280+
#undef SAVE
281+
}
282+
180283
void static inline ImagingLineBoxBlur8(
181284
UINT8 *lineOut,
182285
UINT8 *lineIn,
@@ -302,8 +405,8 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
302405
ImagingSectionEnter(&cookie);
303406

304407
if (imIn->image8) {
305-
if (radius) {
306-
for (y = 0; y < imIn->ysize; y++) {
408+
for (y = 0; y < imIn->ysize; y++) {
409+
if (radius) {
307410
ImagingLineBoxBlur8(
308411
(imIn == imOut ? (UINT8 *)lineOut : imOut->image8[y]),
309412
imIn->image8[y],
@@ -314,13 +417,7 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
314417
ww,
315418
fw
316419
);
317-
if (imIn == imOut) {
318-
// Commit.
319-
memcpy(imOut->image8[y], lineOut, imIn->xsize);
320-
}
321-
}
322-
} else {
323-
for (y = 0; y < imIn->ysize; y++) {
420+
} else {
324421
ImagingLineBoxBlurZero8(
325422
(imIn == imOut ? (UINT8 *)lineOut : imOut->image8[y]),
326423
imIn->image8[y],
@@ -330,16 +427,17 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
330427
ww,
331428
fw
332429
);
333-
if (imIn == imOut) {
334-
// Commit.
335-
memcpy(imOut->image8[y], lineOut, imIn->xsize);
336-
}
430+
}
431+
if (imIn == imOut) {
432+
// Commit.
433+
memcpy(imOut->image8[y], lineOut, imIn->xsize);
337434
}
338435
}
339436
} else {
340-
if (radius) {
341-
for (y = 0; y < imIn->ysize; y++) {
342-
ImagingLineBoxBlur32(
437+
for (y = 0; y < imIn->ysize; y++) {
438+
#if defined(__SSE4__)
439+
if (radius) {
440+
ImagingLineBoxBlur32_SSE4(
343441
imIn == imOut ? (UINT32 *)lineOut : (UINT32 *)imOut->image32[y],
344442
(UINT32 *)imIn->image32[y],
345443
imIn->xsize - 1,
@@ -349,14 +447,8 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
349447
ww,
350448
fw
351449
);
352-
if (imIn == imOut) {
353-
// Commit.
354-
memcpy(imOut->image32[y], lineOut, imIn->xsize * 4);
355-
}
356-
}
357-
} else {
358-
for (y = 0; y < imIn->ysize; y++) {
359-
ImagingLineBoxBlurZero32(
450+
} else {
451+
ImagingLineBoxBlurZero32_SSE4(
360452
imIn == imOut ? (UINT32 *)lineOut : (UINT32 *)imOut->image32[y],
361453
(UINT32 *)imIn->image32[y],
362454
imIn->xsize - 1,
@@ -365,10 +457,22 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
365457
ww,
366458
fw
367459
);
368-
if (imIn == imOut) {
369-
// Commit.
370-
memcpy(imOut->image32[y], lineOut, imIn->xsize * 4);
371-
}
460+
}
461+
#else // defined(__SSE4__)
462+
ImagingLineBoxBlur32(
463+
imIn == imOut ? (pixel *)lineOut : (pixel *)imOut->image32[y],
464+
(pixel *)imIn->image32[y],
465+
imIn->xsize - 1,
466+
radius,
467+
edgeA,
468+
edgeB,
469+
ww,
470+
fw
471+
);
472+
#endif // defined(__SSE4__)
473+
if (imIn == imOut) {
474+
// Commit.
475+
memcpy(imOut->image32[y], lineOut, imIn->xsize * 4);
372476
}
373477
}
374478
}

0 commit comments

Comments
 (0)