/* Larger / smaller of two values.
 * These are function-like macros: each argument may be evaluated twice,
 * so do not pass expressions with side effects (e.g. MAX(i++, j)). */
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
55
6- void static inline ImagingLineBoxBlur32 (
/* One pixel viewed as four UINT8 components, so that a line of pixels
   can be indexed as line[x][channel]. */
6+ typedef UINT8 pixel [4 ];
7+
8+ #if defined(__SSE4__ )
9+
10+ void static inline ImagingLineBoxBlur32_SSE4 (
711 UINT32 * lineOut ,
812 UINT32 * lineIn ,
913 int lastx ,
@@ -115,7 +119,7 @@ void static inline ImagingLineBoxBlur32(
115119#undef SAVE
116120}
117121
118- void static inline ImagingLineBoxBlurZero32 (
122+ void static inline ImagingLineBoxBlurZero32_SSE4 (
119123 UINT32 * lineOut ,
120124 UINT32 * lineIn ,
121125 int lastx ,
@@ -177,6 +181,105 @@ void static inline ImagingLineBoxBlurZero32(
177181#undef SAVE
178182}
179183
184+ #endif // defined(__SSE4__)
185+
186+ void static inline ImagingLineBoxBlur32 (
187+ pixel * lineOut ,
188+ pixel * lineIn ,
189+ int lastx ,
190+ int radius ,
191+ int edgeA ,
192+ int edgeB ,
193+ UINT32 ww ,
194+ UINT32 fw
195+ ) {
196+ int x ;
197+ UINT32 acc [4 ];
198+ UINT32 bulk [4 ];
199+
200+ #define MOVE_ACC (acc , subtract , add ) \
201+ acc[0] += lineIn[add][0] - lineIn[subtract][0]; \
202+ acc[1] += lineIn[add][1] - lineIn[subtract][1]; \
203+ acc[2] += lineIn[add][2] - lineIn[subtract][2]; \
204+ acc[3] += lineIn[add][3] - lineIn[subtract][3];
205+
206+ #define ADD_FAR (bulk , acc , left , right ) \
207+ bulk[0] = (acc[0] * ww) + (lineIn[left][0] + lineIn[right][0]) * fw; \
208+ bulk[1] = (acc[1] * ww) + (lineIn[left][1] + lineIn[right][1]) * fw; \
209+ bulk[2] = (acc[2] * ww) + (lineIn[left][2] + lineIn[right][2]) * fw; \
210+ bulk[3] = (acc[3] * ww) + (lineIn[left][3] + lineIn[right][3]) * fw;
211+
212+ #define SAVE (x , bulk ) \
213+ lineOut[x][0] = (UINT8)((bulk[0] + (1 << 23)) >> 24); \
214+ lineOut[x][1] = (UINT8)((bulk[1] + (1 << 23)) >> 24); \
215+ lineOut[x][2] = (UINT8)((bulk[2] + (1 << 23)) >> 24); \
216+ lineOut[x][3] = (UINT8)((bulk[3] + (1 << 23)) >> 24);
217+
218+ /* Compute acc for -1 pixel (outside of image):
219+ From "-radius-1" to "-1" get first pixel,
220+ then from "0" to "radius-1". */
221+ acc [0 ] = lineIn [0 ][0 ] * (radius + 1 );
222+ acc [1 ] = lineIn [0 ][1 ] * (radius + 1 );
223+ acc [2 ] = lineIn [0 ][2 ] * (radius + 1 );
224+ acc [3 ] = lineIn [0 ][3 ] * (radius + 1 );
225+ /* As radius can be bigger than xsize, iterate to edgeA -1. */
226+ for (x = 0 ; x < edgeA - 1 ; x ++ ) {
227+ acc [0 ] += lineIn [x ][0 ];
228+ acc [1 ] += lineIn [x ][1 ];
229+ acc [2 ] += lineIn [x ][2 ];
230+ acc [3 ] += lineIn [x ][3 ];
231+ }
232+ /* Then multiply remainder to last x. */
233+ acc [0 ] += lineIn [lastx ][0 ] * (radius - edgeA + 1 );
234+ acc [1 ] += lineIn [lastx ][1 ] * (radius - edgeA + 1 );
235+ acc [2 ] += lineIn [lastx ][2 ] * (radius - edgeA + 1 );
236+ acc [3 ] += lineIn [lastx ][3 ] * (radius - edgeA + 1 );
237+
238+ if (edgeA <= edgeB ) {
239+ /* Subtract pixel from left ("0").
240+ Add pixels from radius. */
241+ for (x = 0 ; x < edgeA ; x ++ ) {
242+ MOVE_ACC (acc , 0 , x + radius );
243+ ADD_FAR (bulk , acc , 0 , x + radius + 1 );
244+ SAVE (x , bulk );
245+ }
246+ /* Subtract previous pixel from "-radius".
247+ Add pixels from radius. */
248+ for (x = edgeA ; x < edgeB ; x ++ ) {
249+ MOVE_ACC (acc , x - radius - 1 , x + radius );
250+ ADD_FAR (bulk , acc , x - radius - 1 , x + radius + 1 );
251+ SAVE (x , bulk );
252+ }
253+ /* Subtract previous pixel from "-radius".
254+ Add last pixel. */
255+ for (x = edgeB ; x <= lastx ; x ++ ) {
256+ MOVE_ACC (acc , x - radius - 1 , lastx );
257+ ADD_FAR (bulk , acc , x - radius - 1 , lastx );
258+ SAVE (x , bulk );
259+ }
260+ } else {
261+ for (x = 0 ; x < edgeB ; x ++ ) {
262+ MOVE_ACC (acc , 0 , x + radius );
263+ ADD_FAR (bulk , acc , 0 , x + radius + 1 );
264+ SAVE (x , bulk );
265+ }
266+ for (x = edgeB ; x < edgeA ; x ++ ) {
267+ MOVE_ACC (acc , 0 , lastx );
268+ ADD_FAR (bulk , acc , 0 , lastx );
269+ SAVE (x , bulk );
270+ }
271+ for (x = edgeA ; x <= lastx ; x ++ ) {
272+ MOVE_ACC (acc , x - radius - 1 , lastx );
273+ ADD_FAR (bulk , acc , x - radius - 1 , lastx );
274+ SAVE (x , bulk );
275+ }
276+ }
277+
278+ #undef MOVE_ACC
279+ #undef ADD_FAR
280+ #undef SAVE
281+ }
282+
180283void static inline ImagingLineBoxBlur8 (
181284 UINT8 * lineOut ,
182285 UINT8 * lineIn ,
@@ -302,8 +405,8 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
302405 ImagingSectionEnter (& cookie );
303406
304407 if (imIn -> image8 ) {
305- if ( radius ) {
306- for ( y = 0 ; y < imIn -> ysize ; y ++ ) {
408+ for ( y = 0 ; y < imIn -> ysize ; y ++ ) {
409+ if ( radius ) {
307410 ImagingLineBoxBlur8 (
308411 (imIn == imOut ? (UINT8 * )lineOut : imOut -> image8 [y ]),
309412 imIn -> image8 [y ],
@@ -314,13 +417,7 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
314417 ww ,
315418 fw
316419 );
317- if (imIn == imOut ) {
318- // Commit.
319- memcpy (imOut -> image8 [y ], lineOut , imIn -> xsize );
320- }
321- }
322- } else {
323- for (y = 0 ; y < imIn -> ysize ; y ++ ) {
420+ } else {
324421 ImagingLineBoxBlurZero8 (
325422 (imIn == imOut ? (UINT8 * )lineOut : imOut -> image8 [y ]),
326423 imIn -> image8 [y ],
@@ -330,16 +427,17 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
330427 ww ,
331428 fw
332429 );
333- if ( imIn == imOut ) {
334- // Commit.
335- memcpy ( imOut -> image8 [ y ], lineOut , imIn -> xsize );
336- }
430+ }
431+ if ( imIn == imOut ) {
432+ // Commit.
433+ memcpy ( imOut -> image8 [ y ], lineOut , imIn -> xsize );
337434 }
338435 }
339436 } else {
340- if (radius ) {
341- for (y = 0 ; y < imIn -> ysize ; y ++ ) {
342- ImagingLineBoxBlur32 (
437+ for (y = 0 ; y < imIn -> ysize ; y ++ ) {
438+ #if defined(__SSE4__ )
439+ if (radius ) {
440+ ImagingLineBoxBlur32_SSE4 (
343441 imIn == imOut ? (UINT32 * )lineOut : (UINT32 * )imOut -> image32 [y ],
344442 (UINT32 * )imIn -> image32 [y ],
345443 imIn -> xsize - 1 ,
@@ -349,14 +447,8 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
349447 ww ,
350448 fw
351449 );
352- if (imIn == imOut ) {
353- // Commit.
354- memcpy (imOut -> image32 [y ], lineOut , imIn -> xsize * 4 );
355- }
356- }
357- } else {
358- for (y = 0 ; y < imIn -> ysize ; y ++ ) {
359- ImagingLineBoxBlurZero32 (
450+ } else {
451+ ImagingLineBoxBlurZero32_SSE4 (
360452 imIn == imOut ? (UINT32 * )lineOut : (UINT32 * )imOut -> image32 [y ],
361453 (UINT32 * )imIn -> image32 [y ],
362454 imIn -> xsize - 1 ,
@@ -365,10 +457,22 @@ ImagingHorizontalBoxBlur(Imaging imOut, Imaging imIn, float floatRadius) {
365457 ww ,
366458 fw
367459 );
368- if (imIn == imOut ) {
369- // Commit.
370- memcpy (imOut -> image32 [y ], lineOut , imIn -> xsize * 4 );
371- }
460+ }
461+ #else // defined(__SSE4__)
462+ ImagingLineBoxBlur32 (
463+ imIn == imOut ? (pixel * )lineOut : (pixel * )imOut -> image32 [y ],
464+ (pixel * )imIn -> image32 [y ],
465+ imIn -> xsize - 1 ,
466+ radius ,
467+ edgeA ,
468+ edgeB ,
469+ ww ,
470+ fw
471+ );
472+ #endif // defined(__SSE4__)
473+ if (imIn == imOut ) {
474+ // Commit.
475+ memcpy (imOut -> image32 [y ], lineOut , imIn -> xsize * 4 );
372476 }
373477 }
374478 }
0 commit comments