From 543cfe2dbaa32d0b1d38cb7cfc6d7bbc93bcfb00 Mon Sep 17 00:00:00 2001 From: Alexander Date: Sun, 5 Jan 2020 15:11:10 +0300 Subject: [PATCH] SIMD Reduce. sse4 implementation --- src/libImaging/Reduce.c | 1174 +++++++++++---------------------------- 1 file changed, 325 insertions(+), 849 deletions(-) diff --git a/src/libImaging/Reduce.c b/src/libImaging/Reduce.c index 60928d2bc36..9672ff2166e 100644 --- a/src/libImaging/Reduce.c +++ b/src/libImaging/Reduce.c @@ -18,6 +18,8 @@ ImagingReduceNxN(Imaging imOut, Imaging imIn, int box[4], int xscale, int yscale int x, y, xx, yy; UINT32 multiplier = division_UINT32(yscale * xscale, 8); UINT32 amend = yscale * xscale / 2; + __m128i mm_multiplier = _mm_set1_epi32(multiplier); + __m128i mm_amend = _mm_set1_epi32(amend); if (imIn->image8) { for (y = 0; y < box[3] / yscale; y++) { @@ -50,130 +52,54 @@ ImagingReduceNxN(Imaging imOut, Imaging imIn, int box[4], int xscale, int yscale } } else { for (y = 0; y < box[3] / yscale; y++) { - int yy_from = box[1] + y * yscale; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx_from = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss3 = amend; - for (yy = yy_from; yy < yy_from + yscale - 1; yy += 2) { - UINT8 *line0 = (UINT8 *)imIn->image[yy]; - UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { - ss0 += line0[xx * 4 + 0] + line0[xx * 4 + 4] + - line1[xx * 4 + 0] + line1[xx * 4 + 4]; - ss3 += line0[xx * 4 + 3] + line0[xx * 4 + 7] + - line1[xx * 4 + 3] + line1[xx * 4 + 7]; - } - if (xscale & 0x01) { - ss0 += line0[xx * 4 + 0] + line1[xx * 4 + 0]; - ss3 += line0[xx * 4 + 3] + line1[xx * 4 + 3]; - } - } - if (yscale & 0x01) { - UINT8 *line = (UINT8 *)imIn->image[yy]; - for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { - ss0 += line[xx * 4 + 0] + line[xx * 4 + 4]; - ss3 += line[xx * 4 + 3] + line[xx * 4 + 7]; - } - if (xscale & 0x01) { - ss0 += line[xx * 4 + 0]; - ss3 += line[xx * 4 + 3]; - } - } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, 0, 0, (ss3 * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx_from = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss1 = amend, ss2 = amend; - for (yy = yy_from; yy < yy_from + yscale - 1; yy += 2) { - UINT8 *line0 = (UINT8 *)imIn->image[yy]; - UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { - ss0 += line0[xx * 4 + 0] + line0[xx * 4 + 4] + - line1[xx * 4 + 0] + line1[xx * 4 + 4]; - ss1 += line0[xx * 4 + 1] + line0[xx * 4 + 5] + - line1[xx * 4 + 1] + line1[xx * 4 + 5]; - ss2 += line0[xx * 4 + 2] + line0[xx * 4 + 6] + - line1[xx * 4 + 2] + line1[xx * 4 + 6]; - } - if (xscale & 0x01) { - ss0 += line0[xx * 4 + 0] + line1[xx * 4 + 0]; - ss1 += line0[xx * 4 + 1] + line1[xx * 4 + 1]; - ss2 += line0[xx * 4 + 2] + line1[xx * 4 + 2]; - } + int yy_from = box[1] + y*yscale; + for (x = 0; x < box[2] / xscale; x++) { + int xx_from = box[0] + x*xscale; + __m128i mm_ss0 = mm_amend; + for (yy = yy_from; yy < yy_from + yscale - 1; yy += 2) { + UINT8 *line0 = (UINT8 *)imIn->image[yy]; + UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; + for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line0[xx*4]), + _mm_setzero_si128()); + __m128i pix1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line1[xx*4]), + _mm_setzero_si128()); + pix0 = _mm_add_epi16(pix0, pix1); + pix0 = _mm_add_epi32( + _mm_unpackhi_epi16(pix0, _mm_setzero_si128()), + _mm_unpacklo_epi16(pix0, _mm_setzero_si128())); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); } - if (yscale & 0x01) { - UINT8 *line = (UINT8 *)imIn->image[yy]; - for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { - ss0 += line[xx * 4 + 0] + line[xx * 4 + 4]; - ss1 += line[xx * 4 + 1] + line[xx * 4 + 5]; - ss2 += line[xx * 4 + 2] + line[xx * 4 + 6]; - } - if (xscale & 0x01) { - ss0 += line[xx * 4 + 0]; - ss1 += line[xx * 4 + 1]; - ss2 += line[xx * 4 + 2]; - } + if (xscale & 0x01) { + __m128i pix0 = mm_cvtepu8_epi32(&line0[xx*4]); + __m128i pix1 = mm_cvtepu8_epi32(&line1[xx*4]); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); + mm_ss0 = _mm_add_epi32(mm_ss0, pix1); } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, - (ss1 * multiplier) >> 24, - (ss2 * multiplier) >> 24, - 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx_from = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss1 = amend, ss2 = amend, ss3 = amend; - for (yy = yy_from; yy < yy_from + yscale - 1; yy += 2) { - UINT8 *line0 = (UINT8 *)imIn->image[yy]; - UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { - ss0 += line0[xx * 4 + 0] + line0[xx * 4 + 4] + - line1[xx * 4 + 0] + line1[xx * 4 + 4]; - ss1 += line0[xx * 4 + 1] + line0[xx * 4 + 5] + - line1[xx * 4 + 1] + line1[xx * 4 + 5]; - ss2 += line0[xx * 4 + 2] + line0[xx * 4 + 6] + - line1[xx * 4 + 2] + line1[xx * 4 + 6]; - ss3 += line0[xx * 4 + 3] + line0[xx * 4 + 7] + - line1[xx * 4 + 3] + line1[xx * 4 + 7]; - } - if (xscale & 0x01) { - ss0 += line0[xx * 4 + 0] + line1[xx * 4 + 0]; - ss1 += line0[xx * 4 + 1] + line1[xx * 4 + 1]; - ss2 += line0[xx * 4 + 2] + line1[xx * 4 + 2]; - ss3 += line0[xx * 4 + 3] + line1[xx * 4 + 3]; - } + if (yscale & 0x01) { + UINT8 *line = (UINT8 *)imIn->image[yy]; + for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line[xx*4]), + _mm_setzero_si128()); + pix0 = _mm_add_epi32( + _mm_unpackhi_epi16(pix0, _mm_setzero_si128()), + _mm_unpacklo_epi16(pix0, _mm_setzero_si128())); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); } - if (yscale & 0x01) { - UINT8 *line = (UINT8 *)imIn->image[yy]; - for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { - ss0 += line[xx * 4 + 0] + line[xx * 4 + 4]; - ss1 += line[xx * 4 + 1] + line[xx * 4 + 5]; - ss2 += line[xx * 4 + 2] + line[xx * 4 + 6]; - ss3 += line[xx * 4 + 3] + line[xx * 4 + 7]; - } - if (xscale & 0x01) { - ss0 += line[xx * 4 + 0]; - ss1 += line[xx * 4 + 1]; - ss2 += line[xx * 4 + 2]; - ss3 += line[xx * 4 + 3]; - } + if (xscale & 0x01) { + __m128i pix0 = mm_cvtepu8_epi32(&line[xx*4]); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, - (ss1 * multiplier) >> 24, - (ss2 * multiplier) >> 24, - (ss3 * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); } + + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -187,6 +113,8 @@ ImagingReduce1xN(Imaging imOut, Imaging imIn, int box[4], int yscale) { int xscale = 1; UINT32 multiplier = division_UINT32(yscale * xscale, 8); UINT32 amend = yscale * xscale / 2; + __m128i mm_multiplier = _mm_set1_epi32(multiplier); + __m128i mm_amend = _mm_set1_epi32(amend); if (imIn->image8) { for (y = 0; y < box[3] / yscale; y++) { @@ -208,79 +136,60 @@ ImagingReduce1xN(Imaging imOut, Imaging imIn, int box[4], int yscale) { } } else { for (y = 0; y < box[3] / yscale; y++) { - int yy_from = box[1] + y * yscale; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss3 = amend; - for (yy = yy_from; yy < yy_from + yscale - 1; yy += 2) { - UINT8 *line0 = (UINT8 *)imIn->image[yy]; - UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - ss0 += line0[xx * 4 + 0] + line1[xx * 4 + 0]; - ss3 += line0[xx * 4 + 3] + line1[xx * 4 + 3]; - } - if (yscale & 0x01) { - UINT8 *line = (UINT8 *)imIn->image[yy]; - ss0 += line[xx * 4 + 0]; - ss3 += line[xx * 4 + 3]; - } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, 0, 0, (ss3 * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); + int yy_from = box[1] + y*yscale; + for (x = 0; x < box[2] / xscale - 1; x += 2) { + int xx = box[0] + x*xscale; + __m128i mm_ss0 = mm_amend; + __m128i mm_ss1 = mm_amend; + for (yy = yy_from; yy < yy_from + yscale - 1; yy += 2) { + UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; + UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line0[xx*4]), + _mm_setzero_si128()); + __m128i pix1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line1[xx*4]), + _mm_setzero_si128()); + pix0 = _mm_add_epi16(pix0, pix1); + mm_ss0 = _mm_add_epi32(mm_ss0, _mm_unpacklo_epi16(pix0, _mm_setzero_si128())); + mm_ss1 = _mm_add_epi32(mm_ss1, _mm_unpackhi_epi16(pix0, _mm_setzero_si128())); } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss1 = amend, ss2 = amend; - for (yy = yy_from; yy < yy_from + yscale - 1; yy += 2) { - UINT8 *line0 = (UINT8 *)imIn->image[yy]; - UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - ss0 += line0[xx * 4 + 0] + line1[xx * 4 + 0]; - ss1 += line0[xx * 4 + 1] + line1[xx * 4 + 1]; - ss2 += line0[xx * 4 + 2] + line1[xx * 4 + 2]; - } - if (yscale & 0x01) { - UINT8 *line = (UINT8 *)imIn->image[yy]; - ss0 += line[xx * 4 + 0]; - ss1 += line[xx * 4 + 1]; - ss2 += line[xx * 4 + 2]; - } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, - (ss1 * multiplier) >> 24, - (ss2 * multiplier) >> 24, - 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); + if (yscale & 0x01) { + UINT8 *line = (UINT8 *)imIn->image[yy]; + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line[xx*4]), + _mm_setzero_si128()); + mm_ss0 = _mm_add_epi32(mm_ss0, _mm_unpacklo_epi16(pix0, _mm_setzero_si128())); + mm_ss1 = _mm_add_epi32(mm_ss1, _mm_unpackhi_epi16(pix0, _mm_setzero_si128())); + } + + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss1 = _mm_mullo_epi32(mm_ss1, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss1 = _mm_srli_epi32(mm_ss1, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss1); + _mm_storel_epi64((__m128i *)(imOut->image32[y] + x), _mm_packus_epi16(mm_ss0, mm_ss0)); + } + for (; x < box[2] / xscale; x++) { + int xx = box[0] + x*xscale; + __m128i mm_ss0 = mm_amend; + for (yy = yy_from; yy < yy_from + yscale - 1; yy += 2) { + UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; + UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; + __m128i pix0 = mm_cvtepu8_epi32(&line0[xx*4]); + __m128i pix1 = mm_cvtepu8_epi32(&line1[xx*4]); + mm_ss0 = _mm_add_epi32(mm_ss0, _mm_add_epi32(pix0, pix1)); } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss1 = amend, ss2 = amend, ss3 = amend; - for (yy = yy_from; yy < yy_from + yscale - 1; yy += 2) { - UINT8 *line0 = (UINT8 *)imIn->image[yy]; - UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - ss0 += line0[xx * 4 + 0] + line1[xx * 4 + 0]; - ss1 += line0[xx * 4 + 1] + line1[xx * 4 + 1]; - ss2 += line0[xx * 4 + 2] + line1[xx * 4 + 2]; - ss3 += line0[xx * 4 + 3] + line1[xx * 4 + 3]; - } - if (yscale & 0x01) { - UINT8 *line = (UINT8 *)imIn->image[yy]; - ss0 += line[xx * 4 + 0]; - ss1 += line[xx * 4 + 1]; - ss2 += line[xx * 4 + 2]; - ss3 += line[xx * 4 + 3]; - } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, - (ss1 * multiplier) >> 24, - (ss2 * multiplier) >> 24, - (ss3 * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); + if (yscale & 0x01) { + UINT8 *line = (UINT8 *)imIn->image[yy]; + __m128i pix0 = mm_cvtepu8_epi32(&line[xx*4]); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); } + + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -294,10 +203,12 @@ ImagingReduceNx1(Imaging imOut, Imaging imIn, int box[4], int xscale) { int yscale = 1; UINT32 multiplier = division_UINT32(yscale * xscale, 8); UINT32 amend = yscale * xscale / 2; + __m128i mm_multiplier = _mm_set1_epi32(multiplier); + __m128i mm_amend = _mm_set1_epi32(amend); if (imIn->image8) { for (y = 0; y < box[3] / yscale; y++) { - int yy = box[1] + y * yscale; + int yy = box[1] + y*yscale; UINT8 *line = (UINT8 *)imIn->image8[yy]; for (x = 0; x < box[2] / xscale; x++) { int xx_from = box[0] + x * xscale; @@ -312,72 +223,71 @@ ImagingReduceNx1(Imaging imOut, Imaging imIn, int box[4], int xscale) { } } } else { - for (y = 0; y < box[3] / yscale; y++) { - int yy = box[1] + y * yscale; - UINT8 *line = (UINT8 *)imIn->image[yy]; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx_from = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss3 = amend; - for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { - ss0 += line[xx * 4 + 0] + line[xx * 4 + 4]; - ss3 += line[xx * 4 + 3] + line[xx * 4 + 7]; - } - if (xscale & 0x01) { - ss0 += line[xx * 4 + 0]; - ss3 += line[xx * 4 + 3]; - } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, 0, 0, (ss3 * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); + for (y = 0; y < box[3] / yscale - 1; y += 2) { + int yy = box[1] + y*yscale; + UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; + UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; + for (x = 0; x < box[2] / xscale; x++) { + int xx_from = box[0] + x*xscale; + __m128i mm_ss0 = mm_amend; + __m128i mm_ss1 = mm_amend; + for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line0[xx*4]), + _mm_setzero_si128()); + __m128i pix1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line1[xx*4]), + _mm_setzero_si128()); + pix0 = _mm_add_epi32( + _mm_unpacklo_epi16(pix0, _mm_setzero_si128()), + _mm_unpackhi_epi16(pix0, _mm_setzero_si128())); + pix1 = _mm_add_epi32( + _mm_unpacklo_epi16(pix1, _mm_setzero_si128()), + _mm_unpackhi_epi16(pix1, _mm_setzero_si128())); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); + mm_ss1 = _mm_add_epi32(mm_ss1, pix1); } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx_from = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss1 = amend, ss2 = amend; - for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { - ss0 += line[xx * 4 + 0] + line[xx * 4 + 4]; - ss1 += line[xx * 4 + 1] + line[xx * 4 + 5]; - ss2 += line[xx * 4 + 2] + line[xx * 4 + 6]; - } - if (xscale & 0x01) { - ss0 += line[xx * 4 + 0]; - ss1 += line[xx * 4 + 1]; - ss2 += line[xx * 4 + 2]; - } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, - (ss1 * multiplier) >> 24, - (ss2 * multiplier) >> 24, - 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); + if (xscale & 0x01) { + __m128i pix0 = mm_cvtepu8_epi32(&line0[xx*4]); + __m128i pix1 = mm_cvtepu8_epi32(&line1[xx*4]); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); + mm_ss1 = _mm_add_epi32(mm_ss1, pix1); } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx_from = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss1 = amend, ss2 = amend, ss3 = amend; - for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { - ss0 += line[xx * 4 + 0] + line[xx * 4 + 4]; - ss1 += line[xx * 4 + 1] + line[xx * 4 + 5]; - ss2 += line[xx * 4 + 2] + line[xx * 4 + 6]; - ss3 += line[xx * 4 + 3] + line[xx * 4 + 7]; - } - if (xscale & 0x01) { - ss0 += line[xx * 4 + 0]; - ss1 += line[xx * 4 + 1]; - ss2 += line[xx * 4 + 2]; - ss3 += line[xx * 4 + 3]; - } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, - (ss1 * multiplier) >> 24, - (ss2 * multiplier) >> 24, - (ss3 * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); + + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss1 = _mm_mullo_epi32(mm_ss1, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss1 = _mm_srli_epi32(mm_ss1, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + mm_ss1 = _mm_packs_epi32(mm_ss1, mm_ss1); + ((UINT32 *)imOut->image[y + 0])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); + ((UINT32 *)imOut->image[y + 1])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss1, mm_ss1)); + } + } + for (; y < box[3] / yscale; y++) { + int yy = box[1] + y*yscale; + UINT8 *line = (UINT8 *)imIn->image[yy]; + for (x = 0; x < box[2] / xscale; x++) { + int xx_from = box[0] + x*xscale; + __m128i mm_ss0 = mm_amend; + for (xx = xx_from; xx < xx_from + xscale - 1; xx += 2) { + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line[xx*4]), + _mm_setzero_si128()); + pix0 = _mm_add_epi32( + _mm_unpacklo_epi16(pix0, _mm_setzero_si128()), + _mm_unpackhi_epi16(pix0, _mm_setzero_si128())); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); + } + if (xscale & 0x01) { + __m128i pix0 = mm_cvtepu8_epi32(&line[xx*4]); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); } + + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -389,8 +299,8 @@ ImagingReduce1x2(Imaging imOut, Imaging imIn, int box[4]) { */ int xscale = 1, yscale = 2; int x, y; - UINT32 ss0, ss1, ss2, ss3; UINT32 amend = yscale * xscale / 2; + __m128i mm_amend = _mm_set1_epi32(amend); if (imIn->image8) { for (y = 0; y < box[3] / yscale; y++) { @@ -398,8 +308,9 @@ ImagingReduce1x2(Imaging imOut, Imaging imIn, int box[4]) { UINT8 *line0 = (UINT8 *)imIn->image8[yy + 0]; UINT8 *line1 = (UINT8 *)imIn->image8[yy + 1]; for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - ss0 = line0[xx + 0] + line1[xx + 0]; + int xx = box[0] + x*xscale; + UINT32 ss0 = line0[xx + 0] + + line1[xx + 0]; imOut->image8[y][x] = (ss0 + amend) >> 1; } } @@ -408,41 +319,17 @@ ImagingReduce1x2(Imaging imOut, Imaging imIn, int box[4]) { int yy = box[1] + y * yscale; UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line1[xx * 4 + 0]; - ss3 = line0[xx * 4 + 3] + line1[xx * 4 + 3]; - v = MAKE_UINT32((ss0 + amend) >> 1, 0, 0, (ss3 + amend) >> 1); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line1[xx * 4 + 0]; - ss1 = line0[xx * 4 + 1] + line1[xx * 4 + 1]; - ss2 = line0[xx * 4 + 2] + line1[xx * 4 + 2]; - v = MAKE_UINT32( - (ss0 + amend) >> 1, (ss1 + amend) >> 1, (ss2 + amend) >> 1, 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line1[xx * 4 + 0]; - ss1 = line0[xx * 4 + 1] + line1[xx * 4 + 1]; - ss2 = line0[xx * 4 + 2] + line1[xx * 4 + 2]; - ss3 = line0[xx * 4 + 3] + line1[xx * 4 + 3]; - v = MAKE_UINT32( - (ss0 + amend) >> 1, - (ss1 + amend) >> 1, - (ss2 + amend) >> 1, - (ss3 + amend) >> 1); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } + for (x = 0; x < box[2] / xscale; x++) { + int xx = box[0] + x*xscale; + __m128i mm_ss0; + + __m128i pix0 = mm_cvtepu8_epi32(&line0[xx*4]); + __m128i pix1 = mm_cvtepu8_epi32(&line1[xx*4]); + mm_ss0 = _mm_add_epi32(pix0, pix1); + mm_ss0 = _mm_add_epi32(mm_ss0, mm_amend); + mm_ss0 = _mm_srli_epi32(mm_ss0, 1); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -454,16 +341,16 @@ ImagingReduce2x1(Imaging imOut, Imaging imIn, int box[4]) { */ int xscale = 2, yscale = 1; int x, y; - UINT32 ss0, ss1, ss2, ss3; UINT32 amend = yscale * xscale / 2; + __m128i mm_amend = _mm_set1_epi16(amend); if (imIn->image8) { for (y = 0; y < box[3] / yscale; y++) { int yy = box[1] + y * yscale; UINT8 *line0 = (UINT8 *)imIn->image8[yy + 0]; for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - ss0 = line0[xx + 0] + line0[xx + 1]; + int xx = box[0] + x*xscale; + UINT32 ss0 = line0[xx + 0] + line0[xx + 1]; imOut->image8[y][x] = (ss0 + amend) >> 1; } } @@ -471,41 +358,17 @@ ImagingReduce2x1(Imaging imOut, Imaging imIn, int box[4]) { for (y = 0; y < box[3] / yscale; y++) { int yy = box[1] + y * yscale; UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7]; - v = MAKE_UINT32((ss0 + amend) >> 1, 0, 0, (ss3 + amend) >> 1); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6]; - v = MAKE_UINT32( - (ss0 + amend) >> 1, (ss1 + amend) >> 1, (ss2 + amend) >> 1, 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7]; - v = MAKE_UINT32( - (ss0 + amend) >> 1, - (ss1 + amend) >> 1, - (ss2 + amend) >> 1, - (ss3 + amend) >> 1); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } + for (x = 0; x < box[2] / xscale; x++) { + int xx = box[0] + x*xscale; + __m128i mm_ss0; + + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line0[xx*4]), + _mm_setzero_si128()); + pix0 = _mm_add_epi16(_mm_srli_si128(pix0, 8), pix0); + mm_ss0 = _mm_add_epi16(pix0, mm_amend); + mm_ss0 = _mm_srli_epi16(mm_ss0, 1); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -517,8 +380,8 @@ ImagingReduce2x2(Imaging imOut, Imaging imIn, int box[4]) { */ int xscale = 2, yscale = 2; int x, y; - UINT32 ss0, ss1, ss2, ss3; UINT32 amend = yscale * xscale / 2; + __m128i mm_amend = _mm_set1_epi16(amend); if (imIn->image8) { for (y = 0; y < box[3] / yscale; y++) { @@ -526,8 +389,9 @@ ImagingReduce2x2(Imaging imOut, Imaging imIn, int box[4]) { UINT8 *line0 = (UINT8 *)imIn->image8[yy + 0]; UINT8 *line1 = (UINT8 *)imIn->image8[yy + 1]; for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - ss0 = line0[xx + 0] + line0[xx + 1] + line1[xx + 0] + line1[xx + 1]; + int xx = box[0] + x*xscale; + UINT32 ss0 = line0[xx + 0] + line0[xx + 1] + + line1[xx + 0] + line1[xx + 1]; imOut->image8[y][x] = (ss0 + amend) >> 2; } } @@ -536,50 +400,21 @@ ImagingReduce2x2(Imaging imOut, Imaging imIn, int box[4]) { int yy = box[1] + y * yscale; UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line1[xx * 4 + 0] + - line1[xx * 4 + 4]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line1[xx * 4 + 3] + - line1[xx * 4 + 7]; - v = MAKE_UINT32((ss0 + amend) >> 2, 0, 0, (ss3 + amend) >> 2); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line1[xx * 4 + 0] + - line1[xx * 4 + 4]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line1[xx * 4 + 1] + - line1[xx * 4 + 5]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line1[xx * 4 + 2] + - line1[xx * 4 + 6]; - v = MAKE_UINT32( - (ss0 + amend) >> 2, (ss1 + amend) >> 2, (ss2 + amend) >> 2, 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line1[xx * 4 + 0] + - line1[xx * 4 + 4]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line1[xx * 4 + 1] + - line1[xx * 4 + 5]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line1[xx * 4 + 2] + - line1[xx * 4 + 6]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line1[xx * 4 + 3] + - line1[xx * 4 + 7]; - v = MAKE_UINT32( - (ss0 + amend) >> 2, - (ss1 + amend) >> 2, - (ss2 + amend) >> 2, - (ss3 + amend) >> 2); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } + for (x = 0; x < box[2] / xscale; x++) { + int xx = box[0] + x*xscale; + __m128i mm_ss0; + + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line0[xx*4]), + _mm_setzero_si128()); + __m128i pix1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line1[xx*4]), + _mm_setzero_si128()); + pix0 = _mm_add_epi16(pix0, pix1); + pix0 = _mm_add_epi16(_mm_srli_si128(pix0, 8), pix0); + mm_ss0 = _mm_add_epi16(pix0, mm_amend); + mm_ss0 = _mm_srli_epi16(mm_ss0, 2); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -591,9 +426,10 @@ ImagingReduce1x3(Imaging imOut, Imaging imIn, int box[4]) { */ int xscale = 1, yscale = 3; int x, y; - UINT32 ss0, ss1, ss2, ss3; UINT32 multiplier = division_UINT32(yscale * xscale, 8); UINT32 amend = yscale * xscale / 2; + __m128i mm_multiplier = _mm_set1_epi32(multiplier); + __m128i mm_amend = _mm_set1_epi32(amend); if (imIn->image8) { for (y = 0; y < box[3] / yscale; y++) { @@ -602,8 +438,10 @@ ImagingReduce1x3(Imaging imOut, Imaging imIn, int box[4]) { UINT8 *line1 = (UINT8 *)imIn->image8[yy + 1]; UINT8 *line2 = (UINT8 *)imIn->image8[yy + 2]; for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - ss0 = line0[xx + 0] + line1[xx + 0] + line2[xx + 0]; + int xx = box[0] + x*xscale; + UINT32 ss0 = line0[xx + 0] + + line1[xx + 0] + + line2[xx + 0]; imOut->image8[y][x] = ((ss0 + amend) * multiplier) >> 24; } } @@ -613,48 +451,21 @@ ImagingReduce1x3(Imaging imOut, Imaging imIn, int box[4]) { UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; UINT8 *line2 = (UINT8 *)imIn->image[yy + 2]; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line1[xx * 4 + 0] + line2[xx * 4 + 0]; - ss3 = line0[xx * 4 + 3] + line1[xx * 4 + 3] + line2[xx * 4 + 3]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - 0, - 0, - ((ss3 + amend) * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line1[xx * 4 + 0] + line2[xx * 4 + 0]; - ss1 = line0[xx * 4 + 1] + line1[xx * 4 + 1] + line2[xx * 4 + 1]; - ss2 = line0[xx * 4 + 2] + line1[xx * 4 + 2] + line2[xx * 4 + 2]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - ((ss1 + amend) * multiplier) >> 24, - ((ss2 + amend) * multiplier) >> 24, - 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line1[xx * 4 + 0] + line2[xx * 4 + 0]; - ss1 = line0[xx * 4 + 1] + line1[xx * 4 + 1] + line2[xx * 4 + 1]; - ss2 = line0[xx * 4 + 2] + line1[xx * 4 + 2] + line2[xx * 4 + 2]; - ss3 = line0[xx * 4 + 3] + line1[xx * 4 + 3] + line2[xx * 4 + 3]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - ((ss1 + amend) * multiplier) >> 24, - ((ss2 + amend) * multiplier) >> 24, - ((ss3 + amend) * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } + for (x = 0; x < box[2] / xscale; x++) { + int xx = box[0] + x*xscale; + __m128i mm_ss0 = _mm_add_epi32( + _mm_add_epi32( + mm_cvtepu8_epi32(&line0[xx*4]), + mm_cvtepu8_epi32(&line1[xx*4])), + _mm_add_epi32( + mm_cvtepu8_epi32(&line2[xx*4]), + mm_amend)); + + mm_ss0 = _mm_add_epi32(mm_ss0, mm_amend); + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -666,17 +477,18 @@ ImagingReduce3x1(Imaging imOut, Imaging imIn, int box[4]) { */ int xscale = 3, yscale = 1; int x, y; - UINT32 ss0, ss1, ss2, ss3; UINT32 multiplier = division_UINT32(yscale * xscale, 8); UINT32 amend = yscale * xscale / 2; + __m128i mm_multiplier = _mm_set1_epi32(multiplier); + __m128i mm_amend = _mm_set1_epi32(amend); if (imIn->image8) { for (y = 0; y < box[3] / yscale; y++) { int yy = box[1] + y * yscale; UINT8 *line0 = (UINT8 *)imIn->image8[yy + 0]; for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - ss0 = line0[xx + 0] + line0[xx + 1] + line0[xx + 2]; + int xx = box[0] + x*xscale; + UINT32 ss0 = line0[xx + 0] + line0[xx + 1] + line0[xx + 2]; imOut->image8[y][x] = ((ss0 + amend) * multiplier) >> 24; } } @@ -684,48 +496,26 @@ ImagingReduce3x1(Imaging imOut, Imaging imIn, int box[4]) { for (y = 0; y < box[3] / yscale; y++) { int yy = box[1] + y * yscale; UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line0[xx * 4 + 11]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - 0, - 0, - ((ss3 + amend) * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line0[xx * 4 + 9]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line0[xx * 4 + 10]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - ((ss1 + amend) * multiplier) >> 24, - ((ss2 + amend) * multiplier) >> 24, - 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line0[xx * 4 + 9]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line0[xx * 4 + 10]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line0[xx * 4 + 11]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - ((ss1 + amend) * multiplier) >> 24, - ((ss2 + amend) * multiplier) >> 24, - ((ss3 + amend) * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } + for (x = 0; x < box[2] / xscale; x++) { + int xx = box[0] + x*xscale; + __m128i mm_ss0; + + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line0[xx*4]), + _mm_setzero_si128()); + pix0 = _mm_add_epi32( + _mm_unpacklo_epi16(pix0, _mm_setzero_si128()), + _mm_unpackhi_epi16(pix0, _mm_setzero_si128())); + + mm_ss0 = _mm_add_epi32( + mm_cvtepu8_epi32(&line0[xx*4 + 8]), + pix0); + + mm_ss0 = _mm_add_epi32(mm_ss0, mm_amend); + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -737,9 +527,10 @@ ImagingReduce3x3(Imaging imOut, Imaging imIn, int box[4]) { */ int xscale = 3, yscale = 3; int x, y; - UINT32 ss0, ss1, ss2, ss3; UINT32 multiplier = division_UINT32(yscale * xscale, 8); UINT32 amend = yscale * xscale / 2; + __m128i mm_multiplier = _mm_set1_epi32(multiplier); + __m128i mm_amend = _mm_set1_epi32(amend); if (imIn->image8) { for (y = 0; y < box[3] / yscale; y++) { @@ -748,10 +539,10 @@ ImagingReduce3x3(Imaging imOut, Imaging imIn, int box[4]) { UINT8 *line1 = (UINT8 *)imIn->image8[yy + 1]; UINT8 *line2 = (UINT8 *)imIn->image8[yy + 2]; for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - ss0 = line0[xx + 0] + line0[xx + 1] + line0[xx + 2] + line1[xx + 0] + - line1[xx + 1] + line1[xx + 2] + line2[xx + 0] + line2[xx + 1] + - line2[xx + 2]; + int xx = box[0] + x*xscale; + UINT32 ss0 = line0[xx + 0] + line0[xx + 1] + line0[xx + 2] + + line1[xx + 0] + line1[xx + 1] + line1[xx + 2] + + line2[xx + 0] + line2[xx + 1] + line2[xx + 2]; imOut->image8[y][x] = ((ss0 + amend) * multiplier) >> 24; } } @@ -761,340 +552,38 @@ ImagingReduce3x3(Imaging imOut, Imaging imIn, int box[4]) { UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; UINT8 *line2 = (UINT8 *)imIn->image[yy + 2]; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8] + - line1[xx * 4 + 0] + line1[xx * 4 + 4] + line1[xx * 4 + 8] + - line2[xx * 4 + 0] + line2[xx * 4 + 4] + line2[xx * 4 + 8]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line0[xx * 4 + 11] + - line1[xx * 4 + 3] + line1[xx * 4 + 7] + line1[xx * 4 + 11] + - line2[xx * 4 + 3] + line2[xx * 4 + 7] + line2[xx * 4 + 11]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - 0, - 0, - ((ss3 + amend) * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8] + - line1[xx * 4 + 0] + line1[xx * 4 + 4] + line1[xx * 4 + 8] + - line2[xx * 4 + 0] + line2[xx * 4 + 4] + line2[xx * 4 + 8]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line0[xx * 4 + 9] + - line1[xx * 4 + 1] + line1[xx * 4 + 5] + line1[xx * 4 + 9] + - line2[xx * 4 + 1] + line2[xx * 4 + 5] + line2[xx * 4 + 9]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line0[xx * 4 + 10] + - line1[xx * 4 + 2] + line1[xx * 4 + 6] + line1[xx * 4 + 10] + - line2[xx * 4 + 2] + line2[xx * 4 + 6] + line2[xx * 4 + 10]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - ((ss1 + amend) * multiplier) >> 24, - ((ss2 + amend) * multiplier) >> 24, - 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8] + - line1[xx * 4 + 0] + line1[xx * 4 + 4] + line1[xx * 4 + 8] + - line2[xx * 4 + 0] + line2[xx * 4 + 4] + line2[xx * 4 + 8]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line0[xx * 4 + 9] + - line1[xx * 4 + 1] + line1[xx * 4 + 5] + line1[xx * 4 + 9] + - line2[xx * 4 + 1] + line2[xx * 4 + 5] + line2[xx * 4 + 9]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line0[xx * 4 + 10] + - line1[xx * 4 + 2] + line1[xx * 4 + 6] + line1[xx * 4 + 10] + - line2[xx * 4 + 2] + line2[xx * 4 + 6] + line2[xx * 4 + 10]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line0[xx * 4 + 11] + - line1[xx * 4 + 3] + line1[xx * 4 + 7] + line1[xx * 4 + 11] + - line2[xx * 4 + 3] + line2[xx * 4 + 7] + line2[xx * 4 + 11]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - ((ss1 + amend) * multiplier) >> 24, - ((ss2 + amend) * multiplier) >> 24, - ((ss3 + amend) * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } - } - } -} - -void -ImagingReduce4x4(Imaging imOut, Imaging imIn, int box[4]) { - /* Optimized implementation for xscale = 4 and yscale = 4. - */ - int xscale = 4, yscale = 4; - int x, y; - UINT32 ss0, ss1, ss2, ss3; - UINT32 amend = yscale * xscale / 2; - - if (imIn->image8) { - for (y = 0; y < box[3] / yscale; y++) { - int yy = box[1] + y * yscale; - UINT8 *line0 = (UINT8 *)imIn->image8[yy + 0]; - UINT8 *line1 = (UINT8 *)imIn->image8[yy + 1]; - UINT8 *line2 = (UINT8 *)imIn->image8[yy + 2]; - UINT8 *line3 = (UINT8 *)imIn->image8[yy + 3]; - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - ss0 = line0[xx + 0] + line0[xx + 1] + line0[xx + 2] + line0[xx + 3] + - line1[xx + 0] + line1[xx + 1] + line1[xx + 2] + line1[xx + 3] + - line2[xx + 0] + line2[xx + 1] + line2[xx + 2] + line2[xx + 3] + - line3[xx + 0] + line3[xx + 1] + line3[xx + 2] + line3[xx + 3]; - imOut->image8[y][x] = (ss0 + amend) >> 4; - } - } - } else { - for (y = 0; y < box[3] / yscale; y++) { - int yy = box[1] + y * yscale; - UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; - UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - UINT8 *line2 = (UINT8 *)imIn->image[yy + 2]; - UINT8 *line3 = (UINT8 *)imIn->image[yy + 3]; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8] + - line0[xx * 4 + 12] + line1[xx * 4 + 0] + line1[xx * 4 + 4] + - line1[xx * 4 + 8] + line1[xx * 4 + 12] + line2[xx * 4 + 0] + - line2[xx * 4 + 4] + line2[xx * 4 + 8] + line2[xx * 4 + 12] + - line3[xx * 4 + 0] + line3[xx * 4 + 4] + line3[xx * 4 + 8] + - line3[xx * 4 + 12]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line0[xx * 4 + 11] + - line0[xx * 4 + 15] + line1[xx * 4 + 3] + line1[xx * 4 + 7] + - line1[xx * 4 + 11] + line1[xx * 4 + 15] + line2[xx * 4 + 3] + - line2[xx * 4 + 7] + line2[xx * 4 + 11] + line2[xx * 4 + 15] + - line3[xx * 4 + 3] + line3[xx * 4 + 7] + line3[xx * 4 + 11] + - line3[xx * 4 + 15]; - v = MAKE_UINT32((ss0 + amend) >> 4, 0, 0, (ss3 + amend) >> 4); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8] + - line0[xx * 4 + 12] + line1[xx * 4 + 0] + line1[xx * 4 + 4] + - line1[xx * 4 + 8] + line1[xx * 4 + 12] + line2[xx * 4 + 0] + - line2[xx * 4 + 4] + line2[xx * 4 + 8] + line2[xx * 4 + 12] + - line3[xx * 4 + 0] + line3[xx * 4 + 4] + line3[xx * 4 + 8] + - line3[xx * 4 + 12]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line0[xx * 4 + 9] + - line0[xx * 4 + 13] + line1[xx * 4 + 1] + line1[xx * 4 + 5] + - line1[xx * 4 + 9] + line1[xx * 4 + 13] + line2[xx * 4 + 1] + - line2[xx * 4 + 5] + line2[xx * 4 + 9] + line2[xx * 4 + 13] + - line3[xx * 4 + 1] + line3[xx * 4 + 5] + line3[xx * 4 + 9] + - line3[xx * 4 + 13]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line0[xx * 4 + 10] + - line0[xx * 4 + 14] + line1[xx * 4 + 2] + line1[xx * 4 + 6] + - line1[xx * 4 + 10] + line1[xx * 4 + 14] + line2[xx * 4 + 2] + - line2[xx * 4 + 6] + line2[xx * 4 + 10] + line2[xx * 4 + 14] + - line3[xx * 4 + 2] + line3[xx * 4 + 6] + line3[xx * 4 + 10] + - line3[xx * 4 + 14]; - v = MAKE_UINT32( - (ss0 + amend) >> 4, (ss1 + amend) >> 4, (ss2 + amend) >> 4, 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8] + - line0[xx * 4 + 12] + line1[xx * 4 + 0] + line1[xx * 4 + 4] + - line1[xx * 4 + 8] + line1[xx * 4 + 12] + line2[xx * 4 + 0] + - line2[xx * 4 + 4] + line2[xx * 4 + 8] + line2[xx * 4 + 12] + - line3[xx * 4 + 0] + line3[xx * 4 + 4] + line3[xx * 4 + 8] + - line3[xx * 4 + 12]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line0[xx * 4 + 9] + - line0[xx * 4 + 13] + line1[xx * 4 + 1] + line1[xx * 4 + 5] + - line1[xx * 4 + 9] + line1[xx * 4 + 13] + line2[xx * 4 + 1] + - line2[xx * 4 + 5] + line2[xx * 4 + 9] + line2[xx * 4 + 13] + - line3[xx * 4 + 1] + line3[xx * 4 + 5] + line3[xx * 4 + 9] + - line3[xx * 4 + 13]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line0[xx * 4 + 10] + - line0[xx * 4 + 14] + line1[xx * 4 + 2] + line1[xx * 4 + 6] + - line1[xx * 4 + 10] + line1[xx * 4 + 14] + line2[xx * 4 + 2] + - line2[xx * 4 + 6] + line2[xx * 4 + 10] + line2[xx * 4 + 14] + - line3[xx * 4 + 2] + line3[xx * 4 + 6] + line3[xx * 4 + 10] + - line3[xx * 4 + 14]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line0[xx * 4 + 11] + - line0[xx * 4 + 15] + line1[xx * 4 + 3] + line1[xx * 4 + 7] + - line1[xx * 4 + 11] + line1[xx * 4 + 15] + line2[xx * 4 + 3] + - line2[xx * 4 + 7] + line2[xx * 4 + 11] + line2[xx * 4 + 15] + - line3[xx * 4 + 3] + line3[xx * 4 + 7] + line3[xx * 4 + 11] + - line3[xx * 4 + 15]; - v = MAKE_UINT32( - (ss0 + amend) >> 4, - (ss1 + amend) >> 4, - (ss2 + amend) >> 4, - (ss3 + amend) >> 4); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } - } - } -} - -void -ImagingReduce5x5(Imaging imOut, Imaging imIn, int box[4]) { - /* Fast special case for xscale = 5 and yscale = 5. - */ - int xscale = 5, yscale = 5; - int x, y; - UINT32 ss0, ss1, ss2, ss3; - UINT32 multiplier = division_UINT32(yscale * xscale, 8); - UINT32 amend = yscale * xscale / 2; - - if (imIn->image8) { - for (y = 0; y < box[3] / yscale; y++) { - int yy = box[1] + y * yscale; - UINT8 *line0 = (UINT8 *)imIn->image8[yy + 0]; - UINT8 *line1 = (UINT8 *)imIn->image8[yy + 1]; - UINT8 *line2 = (UINT8 *)imIn->image8[yy + 2]; - UINT8 *line3 = (UINT8 *)imIn->image8[yy + 3]; - UINT8 *line4 = (UINT8 *)imIn->image8[yy + 4]; for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - ss0 = line0[xx + 0] + line0[xx + 1] + line0[xx + 2] + line0[xx + 3] + - line0[xx + 4] + line1[xx + 0] + line1[xx + 1] + line1[xx + 2] + - line1[xx + 3] + line1[xx + 4] + line2[xx + 0] + line2[xx + 1] + - line2[xx + 2] + line2[xx + 3] + line2[xx + 4] + line3[xx + 0] + - line3[xx + 1] + line3[xx + 2] + line3[xx + 3] + line3[xx + 4] + - line4[xx + 0] + line4[xx + 1] + line4[xx + 2] + line4[xx + 3] + - line4[xx + 4]; - imOut->image8[y][x] = ((ss0 + amend) * multiplier) >> 24; - } - } - } else { - for (y = 0; y < box[3] / yscale; y++) { - int yy = box[1] + y * yscale; - UINT8 *line0 = (UINT8 *)imIn->image[yy + 0]; - UINT8 *line1 = (UINT8 *)imIn->image[yy + 1]; - UINT8 *line2 = (UINT8 *)imIn->image[yy + 2]; - UINT8 *line3 = (UINT8 *)imIn->image[yy + 3]; - UINT8 *line4 = (UINT8 *)imIn->image[yy + 4]; - if (imIn->bands == 2) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8] + - line0[xx * 4 + 12] + line0[xx * 4 + 16] + line1[xx * 4 + 0] + - line1[xx * 4 + 4] + line1[xx * 4 + 8] + line1[xx * 4 + 12] + - line1[xx * 4 + 16] + line2[xx * 4 + 0] + line2[xx * 4 + 4] + - line2[xx * 4 + 8] + line2[xx * 4 + 12] + line2[xx * 4 + 16] + - line3[xx * 4 + 0] + line3[xx * 4 + 4] + line3[xx * 4 + 8] + - line3[xx * 4 + 12] + line3[xx * 4 + 16] + line4[xx * 4 + 0] + - line4[xx * 4 + 4] + line4[xx * 4 + 8] + line4[xx * 4 + 12] + - line4[xx * 4 + 16]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line0[xx * 4 + 11] + - line0[xx * 4 + 15] + line0[xx * 4 + 19] + line1[xx * 4 + 3] + - line1[xx * 4 + 7] + line1[xx * 4 + 11] + line1[xx * 4 + 15] + - line1[xx * 4 + 19] + line2[xx * 4 + 3] + line2[xx * 4 + 7] + - line2[xx * 4 + 11] + line2[xx * 4 + 15] + line2[xx * 4 + 19] + - line3[xx * 4 + 3] + line3[xx * 4 + 7] + line3[xx * 4 + 11] + - line3[xx * 4 + 15] + line3[xx * 4 + 19] + line4[xx * 4 + 3] + - line4[xx * 4 + 7] + line4[xx * 4 + 11] + line4[xx * 4 + 15] + - line4[xx * 4 + 19]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - 0, - 0, - ((ss3 + amend) * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else if (imIn->bands == 3) { - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8] + - line0[xx * 4 + 12] + line0[xx * 4 + 16] + line1[xx * 4 + 0] + - line1[xx * 4 + 4] + line1[xx * 4 + 8] + line1[xx * 4 + 12] + - line1[xx * 4 + 16] + line2[xx * 4 + 0] + line2[xx * 4 + 4] + - line2[xx * 4 + 8] + line2[xx * 4 + 12] + line2[xx * 4 + 16] + - line3[xx * 4 + 0] + line3[xx * 4 + 4] + line3[xx * 4 + 8] + - line3[xx * 4 + 12] + line3[xx * 4 + 16] + line4[xx * 4 + 0] + - line4[xx * 4 + 4] + line4[xx * 4 + 8] + line4[xx * 4 + 12] + - line4[xx * 4 + 16]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line0[xx * 4 + 9] + - line0[xx * 4 + 13] + line0[xx * 4 + 17] + line1[xx * 4 + 1] + - line1[xx * 4 + 5] + line1[xx * 4 + 9] + line1[xx * 4 + 13] + - line1[xx * 4 + 17] + line2[xx * 4 + 1] + line2[xx * 4 + 5] + - line2[xx * 4 + 9] + line2[xx * 4 + 13] + line2[xx * 4 + 17] + - line3[xx * 4 + 1] + line3[xx * 4 + 5] + line3[xx * 4 + 9] + - line3[xx * 4 + 13] + line3[xx * 4 + 17] + line4[xx * 4 + 1] + - line4[xx * 4 + 5] + line4[xx * 4 + 9] + line4[xx * 4 + 13] + - line4[xx * 4 + 17]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line0[xx * 4 + 10] + - line0[xx * 4 + 14] + line0[xx * 4 + 18] + line1[xx * 4 + 2] + - line1[xx * 4 + 6] + line1[xx * 4 + 10] + line1[xx * 4 + 14] + - line1[xx * 4 + 18] + line2[xx * 4 + 2] + line2[xx * 4 + 6] + - line2[xx * 4 + 10] + line2[xx * 4 + 14] + line2[xx * 4 + 18] + - line3[xx * 4 + 2] + line3[xx * 4 + 6] + line3[xx * 4 + 10] + - line3[xx * 4 + 14] + line3[xx * 4 + 18] + line4[xx * 4 + 2] + - line4[xx * 4 + 6] + line4[xx * 4 + 10] + line4[xx * 4 + 14] + - line4[xx * 4 + 18]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - ((ss1 + amend) * multiplier) >> 24, - ((ss2 + amend) * multiplier) >> 24, - 0); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } - } else { // bands == 4 - for (x = 0; x < box[2] / xscale; x++) { - int xx = box[0] + x * xscale; - UINT32 v; - ss0 = line0[xx * 4 + 0] + line0[xx * 4 + 4] + line0[xx * 4 + 8] + - line0[xx * 4 + 12] + line0[xx * 4 + 16] + line1[xx * 4 + 0] + - line1[xx * 4 + 4] + line1[xx * 4 + 8] + line1[xx * 4 + 12] + - line1[xx * 4 + 16] + line2[xx * 4 + 0] + line2[xx * 4 + 4] + - line2[xx * 4 + 8] + line2[xx * 4 + 12] + line2[xx * 4 + 16] + - line3[xx * 4 + 0] + line3[xx * 4 + 4] + line3[xx * 4 + 8] + - line3[xx * 4 + 12] + line3[xx * 4 + 16] + line4[xx * 4 + 0] + - line4[xx * 4 + 4] + line4[xx * 4 + 8] + line4[xx * 4 + 12] + - line4[xx * 4 + 16]; - ss1 = line0[xx * 4 + 1] + line0[xx * 4 + 5] + line0[xx * 4 + 9] + - line0[xx * 4 + 13] + line0[xx * 4 + 17] + line1[xx * 4 + 1] + - line1[xx * 4 + 5] + line1[xx * 4 + 9] + line1[xx * 4 + 13] + - line1[xx * 4 + 17] + line2[xx * 4 + 1] + line2[xx * 4 + 5] + - line2[xx * 4 + 9] + line2[xx * 4 + 13] + line2[xx * 4 + 17] + - line3[xx * 4 + 1] + line3[xx * 4 + 5] + line3[xx * 4 + 9] + - line3[xx * 4 + 13] + line3[xx * 4 + 17] + line4[xx * 4 + 1] + - line4[xx * 4 + 5] + line4[xx * 4 + 9] + line4[xx * 4 + 13] + - line4[xx * 4 + 17]; - ss2 = line0[xx * 4 + 2] + line0[xx * 4 + 6] + line0[xx * 4 + 10] + - line0[xx * 4 + 14] + line0[xx * 4 + 18] + line1[xx * 4 + 2] + - line1[xx * 4 + 6] + line1[xx * 4 + 10] + line1[xx * 4 + 14] + - line1[xx * 4 + 18] + line2[xx * 4 + 2] + line2[xx * 4 + 6] + - line2[xx * 4 + 10] + line2[xx * 4 + 14] + line2[xx * 4 + 18] + - line3[xx * 4 + 2] + line3[xx * 4 + 6] + line3[xx * 4 + 10] + - line3[xx * 4 + 14] + line3[xx * 4 + 18] + line4[xx * 4 + 2] + - line4[xx * 4 + 6] + line4[xx * 4 + 10] + line4[xx * 4 + 14] + - line4[xx * 4 + 18]; - ss3 = line0[xx * 4 + 3] + line0[xx * 4 + 7] + line0[xx * 4 + 11] + - line0[xx * 4 + 15] + line0[xx * 4 + 19] + line1[xx * 4 + 3] + - line1[xx * 4 + 7] + line1[xx * 4 + 11] + line1[xx * 4 + 15] + - line1[xx * 4 + 19] + line2[xx * 4 + 3] + line2[xx * 4 + 7] + - line2[xx * 4 + 11] + line2[xx * 4 + 15] + line2[xx * 4 + 19] + - line3[xx * 4 + 3] + line3[xx * 4 + 7] + line3[xx * 4 + 11] + - line3[xx * 4 + 15] + line3[xx * 4 + 19] + line4[xx * 4 + 3] + - line4[xx * 4 + 7] + line4[xx * 4 + 11] + line4[xx * 4 + 15] + - line4[xx * 4 + 19]; - v = MAKE_UINT32( - ((ss0 + amend) * multiplier) >> 24, - ((ss1 + amend) * multiplier) >> 24, - ((ss2 + amend) * multiplier) >> 24, - ((ss3 + amend) * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); - } + int xx = box[0] + x*xscale; + __m128i mm_ss0; + + __m128i pix0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line0[xx*4]), + _mm_setzero_si128()); + __m128i pix1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line1[xx*4]), + _mm_setzero_si128()); + __m128i pix2 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *) &line2[xx*4]), + _mm_setzero_si128()); + + pix0 = _mm_add_epi16(_mm_add_epi16(pix0, pix1), pix2); + pix0 = _mm_add_epi32( + _mm_unpacklo_epi16(pix0, _mm_setzero_si128()), + _mm_unpackhi_epi16(pix0, _mm_setzero_si128())); + + mm_ss0 = _mm_add_epi32( + _mm_add_epi32( + mm_cvtepu8_epi32(&line0[xx*4 + 8]), + mm_cvtepu8_epi32(&line1[xx*4 + 8])), + _mm_add_epi32( + mm_cvtepu8_epi32(&line2[xx*4 + 8]), + pix0)); + + mm_ss0 = _mm_add_epi32(mm_ss0, mm_amend); + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -1162,78 +651,69 @@ ImagingReduceCorners(Imaging imOut, Imaging imIn, int box[4], int xscale, int ys int scale = (box[2] % xscale) * yscale; UINT32 multiplier = division_UINT32(scale, 8); UINT32 amend = scale / 2; + __m128i mm_multiplier = _mm_set1_epi32(multiplier); + __m128i mm_amend = _mm_set1_epi32(amend); for (y = 0; y < box[3] / yscale; y++) { - int yy_from = box[1] + y * yscale; - UINT32 v; - UINT32 ss0 = amend, ss1 = amend, ss2 = amend, ss3 = amend; + int yy_from = box[1] + y*yscale; + __m128i mm_ss0 = mm_amend; x = box[2] / xscale; for (yy = yy_from; yy < yy_from + yscale; yy++) { UINT8 *line = (UINT8 *)imIn->image[yy]; - for (xx = box[0] + x * xscale; xx < box[0] + box[2]; xx++) { - ss0 += line[xx * 4 + 0]; - ss1 += line[xx * 4 + 1]; - ss2 += line[xx * 4 + 2]; - ss3 += line[xx * 4 + 3]; + for (xx = box[0] + x*xscale; xx < box[0] + box[2]; xx++) { + __m128i pix0 = mm_cvtepu8_epi32(&line[xx*4]); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); } } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, - (ss1 * multiplier) >> 24, - (ss2 * multiplier) >> 24, - (ss3 * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } if (box[3] % yscale) { int scale = xscale * (box[3] % yscale); UINT32 multiplier = division_UINT32(scale, 8); UINT32 amend = scale / 2; + __m128i mm_multiplier = _mm_set1_epi32(multiplier); + __m128i mm_amend = _mm_set1_epi32(amend); y = box[3] / yscale; for (x = 0; x < box[2] / xscale; x++) { - int xx_from = box[0] + x * xscale; - UINT32 v; - UINT32 ss0 = amend, ss1 = amend, ss2 = amend, ss3 = amend; - for (yy = box[1] + y * yscale; yy < box[1] + box[3]; yy++) { + int xx_from = box[0] + x*xscale; + __m128i mm_ss0 = mm_amend; + for (yy = box[1] + y*yscale; yy < box[1] + box[3]; yy++) { UINT8 *line = (UINT8 *)imIn->image[yy]; for (xx = xx_from; xx < xx_from + xscale; xx++) { - ss0 += line[xx * 4 + 0]; - ss1 += line[xx * 4 + 1]; - ss2 += line[xx * 4 + 2]; - ss3 += line[xx * 4 + 3]; + __m128i pix0 = mm_cvtepu8_epi32(&line[xx*4]); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); } } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, - (ss1 * multiplier) >> 24, - (ss2 * multiplier) >> 24, - (ss3 * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } if (box[2] % xscale && box[3] % yscale) { int scale = (box[2] % xscale) * (box[3] % yscale); UINT32 multiplier = division_UINT32(scale, 8); UINT32 amend = scale / 2; - UINT32 v; - UINT32 ss0 = amend, ss1 = amend, ss2 = amend, ss3 = amend; + __m128i mm_multiplier = _mm_set1_epi32(multiplier); + __m128i mm_amend = _mm_set1_epi32(amend); + __m128i mm_ss0 = mm_amend; x = box[2] / xscale; y = box[3] / yscale; for (yy = box[1] + y * yscale; yy < box[1] + box[3]; yy++) { UINT8 *line = (UINT8 *)imIn->image[yy]; - for (xx = box[0] + x * xscale; xx < box[0] + box[2]; xx++) { - ss0 += line[xx * 4 + 0]; - ss1 += line[xx * 4 + 1]; - ss2 += line[xx * 4 + 2]; - ss3 += line[xx * 4 + 3]; + for (xx = box[0] + x*xscale; xx < box[0] + box[2]; xx++) { + __m128i pix0 = mm_cvtepu8_epi32(&line[xx*4]); + mm_ss0 = _mm_add_epi32(mm_ss0, pix0); } } - v = MAKE_UINT32( - (ss0 * multiplier) >> 24, - (ss1 * multiplier) >> 24, - (ss2 * multiplier) >> 24, - (ss3 * multiplier) >> 24); - memcpy(imOut->image[y] + x * sizeof(v), &v, sizeof(v)); + mm_ss0 = _mm_mullo_epi32(mm_ss0, mm_multiplier); + mm_ss0 = _mm_srli_epi32(mm_ss0, 24); + mm_ss0 = _mm_packs_epi32(mm_ss0, mm_ss0); + ((UINT32 *)imOut->image[y])[x] = _mm_cvtsi128_si32(_mm_packus_epi16(mm_ss0, mm_ss0)); } } } @@ -1452,15 +932,11 @@ ImagingReduce(Imaging imIn, int xscale, int yscale, int box[4]) { } else { ImagingReduceNx1(imOut, imIn, box, xscale); } - } else if (xscale == yscale && xscale <= 5) { + } else if (xscale == yscale && xscale <= 3) { if (xscale == 2) { ImagingReduce2x2(imOut, imIn, box); } else if (xscale == 3) { ImagingReduce3x3(imOut, imIn, box); - } else if (xscale == 4) { - ImagingReduce4x4(imOut, imIn, box); - } else { - ImagingReduce5x5(imOut, imIn, box); } } else { ImagingReduceNxN(imOut, imIn, box, xscale, yscale);