/* prim_16to32bpp_opt.c
 * 16-bit to 32-bit color conversion via SSE/Neon
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
/* #elif defined(WITH_NEON) */
/* #include <arm_neon.h> */
#endif /* WITH_SSE2 */

#include "prim_internal.h"
#include "prim_16to32bpp.h"

#ifdef WITH_SSE2

/* ------------------------------------------------------------------------- */
/* Note: _no_invert and _invert could be coded with variables as shift
 * amounts and a single routine, but tests showed that was much slower.
 */
static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_no_invert(
    const UINT16* pSrc, INT32 srcStep,
    UINT32* pDst, INT32 dstStep,
    UINT32 width, UINT32 height,
    BOOL alpha)
{
    const BYTE *src = (const BYTE *) pSrc;
    BYTE *dst = (BYTE *) pDst;
    int h;
    int srcRowBump = srcStep - (width * sizeof(UINT16));
    int dstRowBump = dstStep - (width * sizeof(UINT32));
    __m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;

    R_FC00 = _mm_set1_epi16(0xFC00);
    R_0300 = _mm_set1_epi16(0x0300);
    R_00F8 = _mm_set1_epi16(0x00F8);
    R_0007 = _mm_set1_epi16(0x0007);
    if (alpha)
        R_alpha = _mm_set1_epi32(0xFF00FF00U);
    else
        R_alpha = _mm_set1_epi32(0x00000000U);

    for (h=0; h<height; h++)
    {
        UINT32 w = width;

        /* Get the destination 16-byte aligned first; handle the leading
         * pixels with the general routine.
         */
        if ((ULONG_PTR) dst & 0x0f)
        {
            UINT32 startup = (16 - ((ULONG_PTR) dst & 0x0f)) / sizeof(UINT32);
            if (startup > width) startup = width;
            general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
                (UINT32*) dst, dstStep, startup, 1, alpha, FALSE);
            src += startup * sizeof(UINT16);
            dst += startup * sizeof(UINT32);
            w -= startup;
        }

        /* The main loop handles eight pixels at a time. */
        while (w >= 8)
        {
            /* If off-stride, use the slower load. */
            if ((ULONG_PTR) src & 0x0f)
                R0 = _mm_lddqu_si128((__m128i *) src);
            else
                R0 = _mm_load_si128((__m128i *) src);
            src += (128/8);

            /* Do the lower two colors, which end up in the lower two bytes. */
            /* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
            R2 = _mm_slli_epi16(R0, 5);
            R2 = _mm_and_si128(R_FC00, R2);
            R1 = _mm_srli_epi16(R0, 1);
            R1 = _mm_and_si128(R_0300, R1);
            R2 = _mm_or_si128(R1, R2);

            /* R = ((P<<3) & 0x00F8) | ((P>>2) & 0x0007) */
            R1 = _mm_slli_epi16(R0, 3);
            R1 = _mm_and_si128(R_00F8, R1);
            R2 = _mm_or_si128(R1, R2);
            R1 = _mm_srli_epi16(R0, 2);
            R1 = _mm_and_si128(R_0007, R1);
            R2 = _mm_or_si128(R1, R2);          /* R2 = lowers */

            /* Handle the upper color. */
            /* B = ((P>>8) & 0x00F8) | ((P>>13) & 0x0007) */
            R1 = _mm_srli_epi16(R0, 8);
            R1 = _mm_and_si128(R_00F8, R1);
            R0 = _mm_srli_epi16(R0, 13);
            R0 = _mm_and_si128(R_0007, R0);
            R1 = _mm_or_si128(R0, R1);          /* R1 = uppers */

            /* Add alpha (or zero). */
            R1 = _mm_or_si128(R_alpha, R1);     /* + alpha */

            /* Unpack to intermix the AB and GR pieces. */
            R0 = _mm_unpackhi_epi16(R2, R1);
            R2 = _mm_unpacklo_epi16(R2, R1);

            /* Store the results. */
            _mm_store_si128((__m128i *) dst, R2);
            dst += (128/8);
            _mm_store_si128((__m128i *) dst, R0);
            dst += (128/8);

            w -= 8;
        }

        /* Handle any remainder. */
        if (w > 0)
        {
            general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
                (UINT32*) dst, dstStep, w, 1, alpha, FALSE);
            src += w * sizeof(UINT16);
            dst += w * sizeof(UINT32);
        }

        /* Bump to the start of the next row. */
        src += srcRowBump;
        dst += dstRowBump;
    }

    return PRIMITIVES_SUCCESS;
}

/* ------------------------------------------------------------------------- */
static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_invert(
    const UINT16* pSrc, INT32 srcStep,
    UINT32* pDst, INT32 dstStep,
    UINT32 width, UINT32 height,
    BOOL alpha)
{
    const BYTE *src = (const BYTE *) pSrc;
    BYTE *dst = (BYTE *) pDst;
    int h;
    int srcRowBump = srcStep - (width * sizeof(UINT16));
    int dstRowBump = dstStep - (width * sizeof(UINT32));
    __m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;

    R_FC00 = _mm_set1_epi16(0xFC00);
    R_0300 = _mm_set1_epi16(0x0300);
    R_00F8 = _mm_set1_epi16(0x00F8);
    R_0007 = _mm_set1_epi16(0x0007);
    if (alpha)
        R_alpha = _mm_set1_epi32(0xFF00FF00U);
    else
        R_alpha = _mm_set1_epi32(0x00000000U);

    for (h=0; h<height; h++)
    {
        UINT32 w = width;

        /* Get the destination 16-byte aligned first; handle the leading
         * pixels with the general routine.
         */
        if ((ULONG_PTR) dst & 0x0f)
        {
            UINT32 startup = (16 - ((ULONG_PTR) dst & 0x0f)) / sizeof(UINT32);
            if (startup > width) startup = width;
            general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
                (UINT32*) dst, dstStep, startup, 1, alpha, TRUE);
            src += startup * sizeof(UINT16);
            dst += startup * sizeof(UINT32);
            w -= startup;
        }

        /* The main loop handles eight pixels at a time. */
        while (w >= 8)
        {
            /* Off-stride, slower load. */
            if ((ULONG_PTR) src & 0x0f)
                R0 = _mm_lddqu_si128((__m128i *) src);
            else
                R0 = _mm_load_si128((__m128i *) src);
            src += (128/8);

            /* Do the lower two colors, which end up in the lower two bytes. */
            /* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
            R2 = _mm_slli_epi16(R0, 5);
            R2 = _mm_and_si128(R_FC00, R2);
            R1 = _mm_srli_epi16(R0, 1);
            R1 = _mm_and_si128(R_0300, R1);
            R2 = _mm_or_si128(R1, R2);

            /* B = ((P>>8) & 0x00F8) | ((P>>13) & 0x0007) */
            R1 = _mm_srli_epi16(R0, 8);
            R1 = _mm_and_si128(R_00F8, R1);
            R2 = _mm_or_si128(R1, R2);
            R1 = _mm_srli_epi16(R0, 13);
            R1 = _mm_and_si128(R_0007, R1);
            R2 = _mm_or_si128(R1, R2);          /* R2 = lowers */

            /* Handle the upper color. */
            /* R = ((P<<3) & 0x00F8) | ((P>>2) & 0x0007) */
            R1 = _mm_slli_epi16(R0, 3);
            R1 = _mm_and_si128(R_00F8, R1);
            R0 = _mm_srli_epi16(R0, 2);
            R0 = _mm_and_si128(R_0007, R0);
            R1 = _mm_or_si128(R0, R1);          /* R1 = uppers */

            /* Add alpha (or zero). */
            R1 = _mm_or_si128(R_alpha, R1);     /* + alpha */

            /* Unpack to intermix the AR and GB pieces. */
            R0 = _mm_unpackhi_epi16(R2, R1);
            R2 = _mm_unpacklo_epi16(R2, R1);

            /* Store the results. */
            _mm_store_si128((__m128i *) dst, R2);
            dst += (128/8);
            _mm_store_si128((__m128i *) dst, R0);
            dst += (128/8);

            w -= 8;
        }

        /* Handle any remainder. */
        if (w > 0)
        {
            general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
                (UINT32*) dst, dstStep, w, 1, alpha, TRUE);
            src += w * sizeof(UINT16);
            dst += w * sizeof(UINT32);
        }

        /* Bump to the start of the next row. */
        src += srcRowBump;
        dst += dstRowBump;
    }

    return PRIMITIVES_SUCCESS;
}

/* ------------------------------------------------------------------------- */
pstatus_t sse3_RGB565ToARGB_16u32u_C3C4(
    const UINT16* pSrc, INT32 srcStep,
    UINT32* pDst, INT32 dstStep,
    UINT32 width, UINT32 height,
    BOOL alpha, BOOL invert)
{
    if (invert)
    {
        return sse3_RGB565ToARGB_16u32u_C3C4_invert(pSrc, srcStep,
            pDst, dstStep, width, height, alpha);
    }
    else
    {
        return sse3_RGB565ToARGB_16u32u_C3C4_no_invert(pSrc, srcStep,
            pDst, dstStep, width, height, alpha);
    }
}
#endif /* WITH_SSE2 */

/* ------------------------------------------------------------------------- */
void primitives_init_16to32bpp_opt(
    primitives_t *prims)
{
#ifdef WITH_SSE2
    if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
    {
        prims->RGB565ToARGB_16u32u_C3C4 = sse3_RGB565ToARGB_16u32u_C3C4;
    }
#endif
}
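
/* ------------------------------------------------------------------------- */
/* For reference only (not part of the build): a scalar sketch of the
 * per-pixel expansion that the SSE loops above vectorize. The helper name
 * and the field labels are assumptions; the no-invert path places the low
 * 5-bit field in output byte 0 and the high 5-bit field in byte 2, and the
 * invert path swaps those two fields.
 *
 *     static UINT32 expand_565_pixel(UINT16 px, BOOL alpha)
 *     {
 *         UINT32 lo5  = ((px << 3) & 0xF8) | ((px >>  2) & 0x07);  // bits  4..0,  5 -> 8 bits
 *         UINT32 mid6 = ((px >> 3) & 0xFC) | ((px >>  9) & 0x03);  // bits 10..5,  6 -> 8 bits
 *         UINT32 hi5  = ((px >> 8) & 0xF8) | ((px >> 13) & 0x07);  // bits 15..11, 5 -> 8 bits
 *         UINT32 out  = lo5 | (mid6 << 8) | (hi5 << 16);
 *         return alpha ? (out | 0xFF000000U) : out;
 *     }
 */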