/*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      Arm-2D Library
 * Title:        arm-2d_helium.c
 * Description:  Acceleration extensions using Helium.
 *
 * $Date:        2. Jun 2021
 * $Revision:    V.0.6.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#define __ARM_2D_IMPL__

#include "arm_2d.h"
#include "__arm_2d_impl.h"

#if defined(__ARM_2D_HAS_HELIUM__) && __ARM_2D_HAS_HELIUM__

#if defined(__clang__)
#   pragma clang diagnostic push
#   pragma clang diagnostic ignored "-Wincompatible-pointer-types-discards-qualifiers"
#   pragma clang diagnostic ignored "-Wcast-qual"
#   pragma clang diagnostic ignored "-Wcast-align"
#   pragma clang diagnostic ignored "-Wextra-semi-stmt"
#   pragma clang diagnostic ignored "-Wsign-conversion"
#   pragma clang diagnostic ignored "-Wunused-function"
#   pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
#   pragma clang diagnostic ignored "-Wdouble-promotion"
#   pragma clang diagnostic ignored "-Wunused-parameter"
#   pragma clang diagnostic ignored "-Wimplicit-float-conversion"
#   pragma clang diagnostic ignored "-Wimplicit-int-conversion"
#   pragma clang diagnostic ignored "-Wtautological-pointer-compare"
#   pragma clang diagnostic ignored "-Wmissing-prototypes"
#   pragma clang diagnostic ignored "-Wsign-compare"
#   pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
#   pragma clang diagnostic ignored "-Wpadded"
#   pragma clang diagnostic ignored "-Wvector-conversion"
#endif

#include "__arm_2d_paving_helium.h"
#include "__arm_2d_math_helium.h"
#include "__arm_2d_utils_helium.h"

#ifdef   __cplusplus
extern "C" {
#endif

/*! \brief initialise the helium acceleration service
 *! \param none
 *! \return none
 */
void __arm_2d_helium_init(void)
{
    /* even if this is empty, do not remove it */
}

/*----------------------------------------------------------------------------*
 * Code Template                                                               *
 *----------------------------------------------------------------------------*/

#define __API_COLOUR                rgb16
#define __API_INT_TYPE              uint16_t
#define __API_INT_TYPE_BIT_NUM      16

#include "__arm_2d_copy_helium.inc"


#define __API_COLOUR                rgb32
#define __API_INT_TYPE              uint32_t
#define __API_INT_TYPE_BIT_NUM      32

#include "__arm_2d_copy_helium.inc"
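/*
 * The two blocks above instantiate the generic Helium copy kernels in
 * "__arm_2d_copy_helium.inc" once per colour depth: each inclusion picks up
 * the current __API_COLOUR / __API_INT_TYPE settings (the .inc file is
 * expected to #undef them when done) and stamps out the rgb16 and rgb32
 * function families. As an illustrative sketch only (the real .inc is not
 * shown here), a template of this style typically builds names by token
 * pasting:
 *
 *     #define __COPY_FUNC2(A)     __arm_2d_impl_##A##_copy
 *     #define __COPY_FUNC(A)      __COPY_FUNC2(A)
 *     void __COPY_FUNC(__API_COLOUR)(__API_INT_TYPE *phwSource, ...);
 *
 * which, with __API_COLOUR == rgb16, declares __arm_2d_impl_rgb16_copy().
 */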
/*----------------------------------------------------------------------------*
 * Copy, Fill and Mirroring                                                    *
 *----------------------------------------------------------------------------*/

void __arm_copy_16_mve(uint16_t * pDst, const uint16_t * pSrc, uint32_t blockSize)
{
    do {
        mve_pred16_t    p = vctp16q(blockSize);

        vstrhq_p_u16(pDst, vldrhq_z_u16(pSrc, p), p);

        /*
         * Decrement the blockSize loop counter
         * Advance vector source and destination pointers
         */
        pSrc += 8;
        pDst += 8;
        blockSize -= 8;
    } while ((int32_t) blockSize > 0);
}

void __arm_copy_32_mve(uint32_t * pDst, const uint32_t * pSrc, uint32_t blockSize)
{
    do {
        mve_pred16_t    p = vctp32q(blockSize);

        vstrwq_p_u32(pDst, vldrwq_z_u32(pSrc, p), p);

        /*
         * Decrement the blockSize loop counter
         * Advance vector source and destination pointers
         */
        pSrc += 4;
        pDst += 4;
        blockSize -= 4;
    } while ((int32_t) blockSize > 0);
}

void __arm_copy_32_x_mirror_mve(uint32_t * pDst,
                                const uint32_t * pSrc,
                                uint32_t width,
                                uint32_t blockSize)
{
    uint32x4_t      offset;
    uint32_t        curOffsetIdx = width - 1;

    offset = vddupq_wb_u32(&curOffsetIdx, 1);
    do {
        mve_pred16_t    p = vctp32q(blockSize);
        uint32x4_t      reversedPixVec =
            vldrwq_gather_shifted_offset_z_u32(pSrc, offset, p);

        offset = vddupq_x_wb_u32(&curOffsetIdx, 1, p);

        vstrwq_p_u32(pDst, reversedPixVec, p);

        /*
         * Decrement the blockSize loop counter
         * Advance destination pointers
         */
        pDst += 4;
        blockSize -= 4;
    } while ((int32_t) blockSize > 0);
}

void __arm_copy_16_mve_narrow(uint16_t * phwSource,
                              int16_t iSourceStride,
                              uint16_t * phwTarget,
                              int16_t iTargetStride,
                              arm_2d_size_t * ptCopySize)
{
#ifdef USE_MVE_INTRINSICS
    for (int32_t x = 0; x < ptCopySize->iWidth; x++) {
        uint16x8_t      srcStr = vidupq_u16((uint32_t) 0, 1);
        uint16x8_t      dstStr = vidupq_u16((uint32_t) 0, 1);

        srcStr = srcStr * iSourceStride;
        dstStr = dstStr * iTargetStride;

        for (int32_t y = 0; y < ptCopySize->iHeight / 8; y++) {
            uint16x8_t      in = vldrhq_gather_shifted_offset_u16(phwSource, srcStr);
            srcStr = vaddq_n_u16(srcStr, (8 * iSourceStride));
            vstrhq_scatter_shifted_offset_u16(phwTarget, dstStr, in);
            dstStr = vaddq_n_u16(dstStr, (8 * iTargetStride));
        }
        phwSource++;
        phwTarget++;
    }
#else
    __asm volatile(
        "   clrm        {r2, r4}                            \n"
        "   vidup.u16   q0, r2, #1                          \n"
        "   vmul.i16    q2, q0, %[iSourceStride]            \n"
        "   vidup.u16   q1, r4, #1                          \n"
        "   vmul.i16    q3, q1, %[iTargetStride]            \n"

        "3:                                                 \n"
        /* outer loop, iterates over columns */
        /* size = ptCopySize->iWidth */
        "   vmov        q0, q2                              \n"
        "   vmov        q1, q3                              \n"

        /* inner loop, iterates over rows (size = ptCopySize->iHeight) */
        "   wlstp.16    lr, %[iHeight], 1f                  \n"
        ".p2align 2                                         \n"
        "2:                                                 \n"
        "   vldrh.u16   q4, [%[phwSource], q0, uxtw #1]     \n"
        "   vadd.i16    q0, q0, %[iSourceStridex8]          \n"
        "   vstrh.16    q4, [%[phwTarget], q1, uxtw #1]     \n"
        "   vadd.i16    q1, q1, %[iTargetStridex8]          \n"
        "   letp        lr, 2b                              \n"
        "1:                                                 \n"

        "   add.n       %[phwSource], #2                    \n"
        "   add.n       %[phwTarget], #2                    \n"
        "   subs        %[iWidth], #1                       \n"
        "   bne         3b                                  \n"
        : [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
        : [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r"(ptCopySize->iWidth),
          [iSourceStride] "r"(iSourceStride), [iSourceStridex8] "r"(iSourceStride * 8),
          [iTargetStride] "r"(iTargetStride), [iTargetStridex8] "r"(iTargetStride * 8)
        : "r2", "r4", "q0", "q1", "q2", "q3", "q4", "memory", "r14", "cc");
#endif
}

void __arm_copy_32_mve_narrow(uint32_t * pwSource,
                              int32_t iSourceStride,
                              uint32_t * pwTarget,
                              int32_t iTargetStride,
                              arm_2d_size_t * ptCopySize)
{
#ifdef USE_MVE_INTRINSICS
    for (int_fast32_t x = 0; x < ptCopySize->iWidth; x++) {
        uint32x4_t      srcStr = vidupq_u32((uint32_t) 0, 1);
        uint32x4_t      dstStr = vidupq_u32((uint32_t) 0, 1);

        srcStr = srcStr * iSourceStride;
        dstStr = dstStr * iTargetStride;

        for (int_fast32_t y = 0; y < ptCopySize->iHeight / 4; y++) {
            uint32x4_t      in = vldrwq_gather_shifted_offset_u32(pwSource, srcStr);
            srcStr = vaddq_n_u32(srcStr, (4 * iSourceStride));
            vstrwq_scatter_shifted_offset_u32(pwTarget, dstStr, in);
            dstStr = vaddq_n_u32(dstStr, (4 * iTargetStride));
        }
        pwSource++;
        pwTarget++;
    }
#else
    __asm volatile(
        "   clrm        {r2, r4}                            \n"
        "   vidup.u32   q0, r2, #1                          \n"
        "   vmul.i32    q2, q0, %[iSourceStride]            \n"
        "   vidup.u32   q1, r4, #1                          \n"
        "   vmul.i32    q3, q1, %[iTargetStride]            \n"

        "3:                                                 \n"
        /* outer loop, iterates over columns */
        /* size = ptCopySize->iWidth */
        "   vmov        q0, q2                              \n"
        "   vmov        q1, q3                              \n"

        /* inner loop, iterates over rows (size = ptCopySize->iHeight) */
        "   wlstp.32    lr, %[iHeight], 1f                  \n"
        ".p2align 2                                         \n"
        "2:                                                 \n"
        "   vldrw.u32   q4, [%[pwSource], q0, uxtw #2]      \n"
        "   vadd.i32    q0, q0, %[iSourceStridex4]          \n"
        "   vstrw.32    q4, [%[pwTarget], q1, uxtw #2]      \n"
        "   vadd.i32    q1, q1, %[iTargetStridex4]          \n"
        "   letp        lr, 2b                              \n"
        "1:                                                 \n"

        "   add.n       %[pwSource], #4                     \n"
        "   add.n       %[pwTarget], #4                     \n"
        "   subs        %[iWidth], #1                       \n"
        "   bne         3b                                  \n"
        : [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
        : [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r"(ptCopySize->iWidth),
          [iSourceStride] "r"(iSourceStride), [iSourceStridex4] "r"(iSourceStride * 4),
          [iTargetStride] "r"(iTargetStride), [iTargetStridex4] "r"(iTargetStride * 4)
        : "r2", "r4", "q0", "q1", "q2", "q3", "q4", "memory", "r14", "cc");
#endif
}
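/*
 * Why the dedicated "narrow" variants above: for images only a few pixels
 * wide, a row-wise vector loop would predicate away most of its lanes, so
 * these kernels instead walk one column per outer iteration and use
 * gather/scatter with per-lane offsets {0, stride, 2*stride, ...} to fill
 * every lane with pixels from consecutive rows. For example, with
 * iSourceStride == 320, the first 16-bit gather of a column reads rows 0..7
 * of that column in a single vector operation using the offset vector
 * {0, 320, 640, ..., 2240}.
 */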
__OVERRIDE_WEAK
void __arm_2d_impl_rgb16_copy(uint16_t * phwSource,
                              int16_t iSourceStride,
                              uint16_t * phwTarget,
                              int16_t iTargetStride,
                              arm_2d_size_t * ptCopySize)
{
    /*
     * 16-bit Narrow copy case:
     * use column copy with scatter / gather
     */
    if (ptCopySize->iWidth <= 4) {
        __arm_copy_16_mve_narrow(phwSource, iSourceStride,
                                 phwTarget, iTargetStride, ptCopySize);
    } else if ((((uint32_t) phwSource & 3) == 0) && (((uint32_t) phwTarget & 3) == 0)
               && ((iSourceStride & 3) == 0) && ((iTargetStride & 3) == 0)) {
        /*
         * source / dst & strides are 64-bit aligned
         * use scalar LDRD/STRD, faster than back to back vector VLDR/VSTR on M55
         */
        __asm volatile(
            "3:                                             \n"
            "   mov     r0, %[phwSource]                    \n"
            "   mov     r1, %[phwTarget]                    \n"
            /* scalar version faster (no DTCM bank conflict) */
            "   wls     lr, %[iWidth], 1f                   \n"
            ".p2align 2                                     \n"
            "2:                                             \n"
            "   ldrd    r2, r3, [r0], #8                    \n"
            "   strd    r2, r3, [r1], #8                    \n"
            "   le      lr, 2b                              \n"
            "1:                                             \n"
            // tail
            "   wls     lr, %[iWidthTail], 1f               \n"
            ".p2align 2                                     \n"
            "2:                                             \n"
            "   ldrh    r2, [r0], #2                        \n"
            "   strh    r2, [r1], #2                        \n"
            "   le      lr, 2b                              \n"
            "1:                                             \n"
            "   add     %[phwSource], %[iSourceStride]      \n"
            "   add     %[phwTarget], %[iTargetStride]      \n"
            "   subs    %[iHeight], #1                      \n"
            "   bne     3b                                  \n"
            : [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
            : [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r"(ptCopySize->iWidth / 4),
              [iWidthTail] "r"(ptCopySize->iWidth & 3),
              [iSourceStride] "r"(iSourceStride * sizeof(uint16_t)),
              [iTargetStride] "r"(iTargetStride * sizeof(uint16_t))
            : "r0", "r1", "r2", "r3", "q0", "memory", "r14", "cc");
    } else {
        /*
         * generic column major 16-bit 2D copy
         */
        int32_t         iWidth = ptCopySize->iWidth;
        int32_t         iHeight = ptCopySize->iHeight;

        __asm volatile(
            "   mov     r2, %[iHeight]                      \n"
            "3:                                             \n"
            "   mov     r0, %[phwSource]                    \n"
            "   mov     r1, %[phwTarget]                    \n"
            "   wlstp.16 lr, %[iWidth], 1f                  \n"
            ".p2align 2                                     \n"
            "2:                                             \n"
            "   vldrh.u16   q0, [r0], #16                   \n"
            "   vstrh.16    q0, [r1], #16                   \n"
            "   letp    lr, 2b                              \n"
            "1:                                             \n"
            "   add     %[phwSource], %[iSourceStride]      \n"
            "   add     %[phwTarget], %[iTargetStride]      \n"
            "   subs    r2, #1                              \n"
            "   bne     3b                                  \n"
            : [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
            : [iHeight] "r"(iHeight), [iWidth] "r"(iWidth),
              [iSourceStride] "r"(iSourceStride * sizeof(uint16_t)),
              [iTargetStride] "r"(iTargetStride * sizeof(uint16_t))
            : "r0", "r1", "r2", "q0", "memory", "r14", "cc");
    }
}

__OVERRIDE_WEAK
void __arm_2d_impl_rgb32_copy(uint32_t * pwSource,
                              int16_t iSourceStride,
                              uint32_t * pwTarget,
                              int16_t iTargetStride,
                              arm_2d_size_t * ptCopySize)
{
    if (ptCopySize->iWidth <= 2) {
        /*
         * 32-bit Narrow copy case:
         * use column copy with scatter / gather
         */
        __arm_copy_32_mve_narrow(pwSource, iSourceStride,
                                 pwTarget, iTargetStride, ptCopySize);
    } else if ((((uint32_t) pwSource & 3) == 0) && (((uint32_t) pwTarget & 3) == 0)
               && ((iSourceStride & 3) == 0) && ((iTargetStride & 3) == 0)) {
        /*
         * source / dst & strides are 64-bit aligned
         * use scalar LDRD/STRD, faster than back to back vector VLDR/VSTR on M55
         */
        __asm volatile(
            "3:                                             \n"
            "   mov     r0, %[pwSource]                     \n"
            "   mov     r1, %[pwTarget]                     \n"
            /* scalar version faster (no DTCM bank conflict) */
            "   wls     lr, %[iWidth], 1f                   \n"
            ".p2align 2                                     \n"
            "2:                                             \n"
            "   ldrd    r2, r3, [r0], #8                    \n"
            "   strd    r2, r3, [r1], #8                    \n"
            "   le      lr, 2b                              \n"
            "1:                                             \n"
            // tail
            "   wls     lr, %[iWidthTail], 1f               \n"
            ".p2align 2                                     \n"
            "2:                                             \n"
            "   ldr     r2, [r0], #4                        \n"
            "   str     r2, [r1], #4                        \n"
            "   le      lr, 2b                              \n"
            "1:                                             \n"
            "   add     %[pwSource], %[iSourceStride]       \n"
            "   add     %[pwTarget], %[iTargetStride]       \n"
            "   subs    %[iHeight], #1                      \n"
            "   bne     3b                                  \n"
            : [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
            : [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r"(ptCopySize->iWidth / 2),
              [iWidthTail] "r"(ptCopySize->iWidth & 1),
              [iSourceStride] "r"(iSourceStride * sizeof(uint32_t)),
              [iTargetStride] "r"(iTargetStride * sizeof(uint32_t))
            : "r0", "r1", "r2", "r3", "q0", "memory", "r14", "cc");
    } else {
        /*
         * generic column major 32-bit 2D copy
         */
        __asm volatile(
            "3:                                             \n"
            "   mov     r0, %[pwSource]                     \n"
            "   mov     r1, %[pwTarget]                     \n"
            "   wlstp.32 lr, %[iWidth], 1f                  \n"
            ".p2align 2                                     \n"
            "2:                                             \n"
            "   vldrw.32    q0, [r0], #16                   \n"
            "   vstrw.32    q0, [r1], #16                   \n"
            "   letp    lr, 2b                              \n"
            "1:                                             \n"
            "   add     %[pwSource], %[iSourceStride]       \n"
            "   add     %[pwTarget], %[iTargetStride]       \n"
            "   subs    %[iHeight], #1                      \n"
            "   bne     3b                                  \n"
            : [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
            : [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r"(ptCopySize->iWidth),
              [iSourceStride] "r"(iSourceStride * sizeof(uint32_t)),
              [iTargetStride] "r"(iTargetStride * sizeof(uint32_t))
            : "r0", "r1", "q0", "memory", "r14", "cc");
    }
}

/*----------------------------------------------------------------------------*
 * alpha blending                                                              *
 *----------------------------------------------------------------------------*/

/* RGB16 Mix */
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_alpha_blending(uint16_t * phwSourceBase,
                                         int16_t iSourceStride,
                                         uint16_t * phwTargetBase,
                                         int16_t iTargetStride,
                                         arm_2d_size_t * ptCopySize,
                                         uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
    int32_t         blkCnt;
    uint16_t        ratio1x8 = chRatio * 8;
    uint16_t        ratio1x4 = chRatio * 4;
    uint16_t        ratio2x8 = (256 - chRatio) * 8;
    uint16_t        ratio2x4 = (256 - chRatio) * 4;

    uint16x8_t      vecMaskR = vdupq_n_u16(0x001f);
    uint16x8_t      vecMaskG = vdupq_n_u16(0x003f);
    uint16x8_t      vecMaskBpck = vdupq_n_u16(0x00f8);
    uint16x8_t      vecMaskGpck = vdupq_n_u16(0x00fc);

    for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
        const uint16_t *phwSource = phwSourceBase;
        uint16_t       *phwTarget = phwTargetBase;

        blkCnt = ptCopySize->iWidth;
        do {
            uint16x8_t      vecIn;
            uint16x8_t      vecR0, vecB0, vecG0;
            uint16x8_t      vecR1, vecB1, vecG1;

            /* unpack 1st stream */
            vecIn = vld1q(phwSource);
            vecR0 = vecIn & vecMaskR;
            vecB0 = vecIn >> 11;
            vecG0 = vecIn >> 5;
            vecG0 = vecG0 & vecMaskG;

            /* unpack 2nd stream */
            vecIn = vld1q(phwTarget);
            vecR1 = vecIn & vecMaskR;
            vecB1 = vecIn >> 11;
            vecG1 = vecIn >> 5;
            vecG1 = vecG1 & vecMaskG;

            /* merge */
            vecR0 = vecR0 * ratio1x8 + vecR1 * ratio2x8;
            vecR0 = vecR0 >> 8;
            vecG0 = vecG0 * ratio1x4 + vecG1 * ratio2x4;
            vecG0 = vecG0 >> 8;
            vecB0 = vecB0 * ratio1x8 + vecB1 * ratio2x8;
            vecB0 = vecB0 >> 8;

            /* pack */
            uint16x8_t      vOut = vecR0 >> 3
                | vmulq((vecG0 & vecMaskGpck), 8)
                | vmulq((vecB0 & vecMaskBpck), 256);

            vst1q(phwTarget, vOut);
            phwSource += 8;
            phwTarget += 8;
            blkCnt -= 8;
        } while (blkCnt > 0);

        phwSourceBase += iSourceStride;
        phwTargetBase += iTargetStride;
    }
#else /* USE_MVE_INTRINSICS */
    uint16_t        ratio1x8 = chRatio * 8;
    uint16_t        ratio1x4 = chRatio * 4;
    uint16_t        ratio2x8 = (256 - chRatio) * 8;
    uint16_t        ratio2x4 = (256 - chRatio) * 4;

    uint16x8_t      vecMaskR = vdupq_n_u16(0x001f);
    uint16x8_t      vecMaskG = vdupq_n_u16(0x003f);
    uint16x8_t      vecMaskBpck = vdupq_n_u16(0x00f8);

    uint32_t        iWidth = ptCopySize->iWidth;
    int32_t         row = ptCopySize->iHeight;
    uint16x8_t      scratch[1];

    vst1q((uint16_t *) scratch, vdupq_n_u16(0x00fc));

    do {
        const uint16_t *pSource = phwSourceBase;
        uint16_t       *pTarget = phwTargetBase;
        register unsigned loopCnt __asm("lr");
        loopCnt = iWidth;

        __asm volatile(
            ".p2align 2                                     \n"
            "   vldrh.u16   q4, [%[pTarget]]                \n"
            "   vldrh.u16   q5, [%[pSource]], #16           \n"
            "   wlstp.16    lr, %[loopCnt], 1f              \n"
            "2:                                             \n"
            // B target extraction
            // right shift by 5 (x 1/32) for M55 friendly
            // IV / Mul pipe interleaving
            "   vqdmulh.s16 q2, q4, %[rshft5]               \n"
            "   vand        q7, q4, %[vecMaskR]             \n"
            "   vmul.i16    q6, q7, %[ratio2x8]             \n"
            // B source extraction
            "   vand        q7, q5, %[vecMaskR]             \n"
            // B mix
            "   vmla.u16    q6, q7, %[ratio1x8]             \n"
            // G extraction
            "   vand        q2, q2, %[vecMaskG]             \n"
            "   vshr.u16    q7, q5, #5                      \n"
            "   vmul.i16    q2, q2, %[ratio2x4]             \n"
            // G extraction
            "   vand        q7, q7, %[vecMaskG]             \n"
            // G mix
            "   vmla.u16    q2, q7, %[ratio1x4]             \n"
            // R extraction
            "   vshr.u16    q4, q4, #11                     \n"
            "   vmul.i16    q7, q4, %[ratio2x8]             \n"
            // R extraction
            "   vshr.u16    q5, q5, #11                     \n"
            // R mix
            "   vmla.u16    q7, q5, %[ratio1x8]             \n"
            "   vshr.u16    q2, q2, #8                      \n"
            "   vldrh.16    q5, [%[scratch]]                \n"
            "   vand        q2, q2, q5                      \n"
            // vmulq((vecG0 & 0x00fc), 8)
            "   vmul.i16    q2, q2, %[eight]                \n"
            "   vshr.u16    q4, q7, #8                      \n"
            // schedule next source load
            "   vldrh.u16   q5, [%[pSource]], #16           \n"
            "   vand        q7, q4, %[vecMaskBpck]          \n"
            // pack R & G
            // vmulq((vecG0 & vecMaskGpck), 8) + vmulq((vecR0 & vecMaskRpck), 256)
            "   vmla.u16    q2, q7, %[twofiftysix]          \n"
            // downshift B ((vecB0 >> 8) >> 3)
            "   vshr.u16    q7, q6, #11                     \n"
            // schedule next target load (pre offset as target not incremented so far)
            "   vldrh.u16   q4, [%[pTarget], #16]           \n"
            // pack blue with R&G
            "   vorr        q2, q2, q7                      \n"
            "   vstrh.16    q2, [%[pTarget]], #16           \n"
            "   letp        lr, 2b                          \n"
            "1:                                             \n"
            : [pSource] "+r"(pSource), [pTarget] "+r"(pTarget), [loopCnt] "+r"(loopCnt)
            : [vecMaskR] "t"(vecMaskR), [vecMaskG] "t"(vecMaskG),
              [vecMaskBpck] "t"(vecMaskBpck),
              [ratio1x8] "r"(ratio1x8), [ratio2x8] "r"(ratio2x8),
              [ratio1x4] "r"(ratio1x4), [ratio2x4] "r"(ratio2x4),
              [eight] "r"(8), [twofiftysix] "r"(256), [rshft5] "r"(1024),
              [scratch] "r"(scratch)
            : "q2", "q4", "q5", "q6", "q7", "memory");

        phwSourceBase += iSourceStride;
        phwTargetBase += iTargetStride;
    } while (--row);
#endif /* USE_MVE_INTRINSICS */
}
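/*
 * Scalar reference for the RGB565 blends in this section (illustrative
 * helper only, not part of the Arm-2D API). The vector kernels fold the
 * channel repositioning into the multiplies: scaling the 5-bit channels by
 * chRatio * 8 and the 6-bit channel by chRatio * 4 means that after the
 * final >> 8 each blended channel is already left-shifted (<< 3 / << 2), so
 * packing only needs the 0x00f8 / 0x00fc masks plus cheap multiplies by 8
 * and 256 (i.e. << 3 and << 8). The plain per-channel arithmetic being
 * implemented is:
 */
__attribute__((unused))
static uint16_t __rgb565_blend_scalar_ref(uint16_t hwSource,
                                          uint16_t hwTarget,
                                          uint_fast8_t chRatio)
{
    uint16_t hwRatioCompl = 256 - chRatio;

    /* unpack both pixels, lowest field first, as in the vector code */
    uint16_t c0s = hwSource & 0x1f;                 /* 5-bit field */
    uint16_t c1s = (hwSource >> 5) & 0x3f;          /* 6-bit field */
    uint16_t c2s = hwSource >> 11;                  /* 5-bit field */
    uint16_t c0t = hwTarget & 0x1f;
    uint16_t c1t = (hwTarget >> 5) & 0x3f;
    uint16_t c2t = hwTarget >> 11;

    /* blend each channel with the 8-bit ratio, then repack */
    uint16_t c0 = (uint16_t) ((c0s * chRatio + c0t * hwRatioCompl) >> 8);
    uint16_t c1 = (uint16_t) ((c1s * chRatio + c1t * hwRatioCompl) >> 8);
    uint16_t c2 = (uint16_t) ((c2s * chRatio + c2t * hwRatioCompl) >> 8);

    return (uint16_t) (c0 | (c1 << 5) | (c2 << 11));
}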
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_colour_filling_with_alpha(uint16_t * __RESTRICT pTargetBase,
                                                    int16_t iTargetStride,
                                                    arm_2d_size_t * __RESTRICT ptCopySize,
                                                    uint16_t Colour,
                                                    uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
    int32_t         blkCnt;
    uint16_t        ratio1x8 = chRatio * 8;
    uint16_t        ratio1x4 = chRatio * 4;
    uint16_t        ratio2x8 = (256 - chRatio) * 8;
    uint16_t        ratio2x4 = (256 - chRatio) * 4;

    uint16x8_t      vecMaskR = vdupq_n_u16(0x001f);
    uint16x8_t      vecMaskG = vdupq_n_u16(0x003f);
    uint16x8_t      vecMaskBpck = vdupq_n_u16(0x00f8);
    uint16x8_t      vecMaskGpck = vdupq_n_u16(0x00fc);

    uint16x8_t      vecIn;
    uint16x8_t      vecColorR, vecColorB, vecColorG;

    /* unpack color & scale */
    vecIn = vdupq_n_u16(Colour);
    vecColorR = (vecIn & vecMaskR) * ratio1x8;
    vecColorB = (vecIn >> 11) * ratio1x8;
    vecColorG = vecIn >> 5;
    vecColorG = (vecColorG & vecMaskG) * ratio1x4;

    for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
        uint16_t       *phwTarget = pTargetBase;

        blkCnt = ptCopySize->iWidth;
        do {
            uint16x8_t      vecR0, vecB0, vecG0;
            uint16x8_t      vecR1, vecB1, vecG1;

            /* unpack stream */
            vecIn = vld1q(phwTarget);
            vecR1 = vecIn & vecMaskR;
            vecB1 = vecIn >> 11;
            vecG1 = vecIn >> 5;
            vecG1 = vecG1 & vecMaskG;

            /* merge */
            vecR0 = vecColorR + vecR1 * ratio2x8;
            vecR0 = vecR0 >> 8;
            vecG0 = vecColorG + vecG1 * ratio2x4;
            vecG0 = vecG0 >> 8;
            vecB0 = vecColorB + vecB1 * ratio2x8;
            vecB0 = vecB0 >> 8;

            /* pack */
            uint16x8_t      vOut = vecR0 >> 3
                | vmulq((vecG0 & vecMaskGpck), 8)
                | vmulq((vecB0 & vecMaskBpck), 256);

            vst1q(phwTarget, vOut);
            phwTarget += 8;
            blkCnt -= 8;
        } while (blkCnt > 0);

        pTargetBase += iTargetStride;
    }
#else /* USE_MVE_INTRINSICS */
    uint16_t        ratio1x8 = chRatio * 8;
    uint16_t        ratio1x4 = chRatio * 4;
    uint16_t        ratio2x8 = (256 - chRatio) * 8;
    uint16_t        ratio2x4 = (256 - chRatio) * 4;

    uint16x8_t      vecMaskR = vdupq_n_u16(0x001f);
    uint16x8_t      vecMaskG = vdupq_n_u16(0x003f);
    uint16x8_t      vecMaskBpck = vdupq_n_u16(0x00f8);

    uint16x8_t      vecColorR, vecColorB, vecColorG;
    uint16x8_t      scratch[4];

    /* unpack color */
    uint16x8_t      vecIn = vdupq_n_u16(Colour);
    vecColorR = vecIn & vecMaskR;
    vecColorB = vecIn >> 11;
    vecColorG = vecIn >> 5;
    vecColorG = vecColorG & vecMaskG;

    vst1q((uint16_t *) scratch, vecColorR * ratio1x8);
    vst1q((uint16_t *) &scratch[1], vecColorB * ratio1x8);
    vst1q((uint16_t *) &scratch[2], vecColorG * ratio1x4);
    vst1q((uint16_t *) &scratch[3], vdupq_n_u16(0x00fc));

    int32_t         row = ptCopySize->iHeight;

    do {
        uint16_t       *phwTarget = pTargetBase;
        register unsigned loopCnt __asm("lr");
        loopCnt = ptCopySize->iWidth;

        __asm volatile(
            "   vldrh.u16   q4, [%[phwTarget]]              \n"
            "   wlstp.16    lr, %[loopCnt], 1f              \n"
            ".p2align 2                                     \n"
            "2:                                             \n"
            // B target extraction
            "   vand        q7, q4, %[vecMaskR]             \n"
            "   vldrh.u16   q6, [%[scratch]]                \n"
            "   vshr.u16    q2, q4, #5                      \n"
            // B mix
            "   vmla.u16    q6, q7, %[ratio2x8]             \n"
            // G extraction
            "   vand        q7, q2, %[vecMaskG]             \n"
            // G extraction
            "   vldrh.u16   q2, [%[scratch], #32]           \n"
            // G mix
            "   vmla.u16    q2, q7, %[ratio2x4]             \n"
            "   vshr.u16    q4, q4, #11                     \n"
            // R extraction
            "   vldrh.u16   q7, [%[scratch], #16]           \n"
            "   vshr.u16    q2, q2, #8                      \n"
            // R mix
            "   vmla.u16    q7, q4, %[ratio2x8]             \n"
            "   vshr.u16    q4, q7, #8                      \n"
            // load duplicated 0xfc mask
            "   vldrh.u16   q7, [%[scratch], #48]           \n"
            "   vand        q2, q2, q7                      \n"
            "   vmul.i16    q2, q2, %[eight]                \n"
            "   vand        q7, q4, %[vecMaskBpck]          \n"
            // pack R & G
            "   vmla.u16    q2, q7, %[twofiftysix]          \n"
            // downshift B ((vecB0 >> 8) >> 3)
            "   vshr.u16    q7, q6, #11                     \n"
            // schedule next target load
            "   vldrh.u16   q4, [%[phwTarget], #16]         \n"
            // pack blue with R&G
            "   vorr        q2, q2, q7                      \n"
            "   vstrh.16    q2, [%[phwTarget]], #16         \n"
            "   letp        lr, 2b                          \n"
            "1:                                             \n"
            : [phwTarget] "+r"(phwTarget), [loopCnt] "+r"(loopCnt)
            : [vecMaskR] "t"(vecMaskR), [vecMaskG] "t"(vecMaskG),
              [vecMaskBpck] "t"(vecMaskBpck),
              [ratio2x8] "r"(ratio2x8), [ratio2x4] "r"(ratio2x4),
              [eight] "r"(8), [twofiftysix] "r"(256), [scratch] "r"(scratch)
            : "q2", "q4", "q5", "q6", "q7", "memory");

        pTargetBase += iTargetStride;
    } while (--row);
#endif /* USE_MVE_INTRINSICS */
}

__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_alpha_blending_colour_masking(uint16_t * __RESTRICT phwSource,
                                                        int16_t iSourceStride,
                                                        uint16_t * __RESTRICT phwTarget,
                                                        int16_t iTargetStride,
                                                        arm_2d_size_t * __RESTRICT ptCopySize,
                                                        uint_fast8_t chRatio,
                                                        uint32_t hwColour)
{
#ifdef USE_MVE_INTRINSICS
    uint32_t        iHeight = ptCopySize->iHeight;
    uint32_t        iWidth = ptCopySize->iWidth;
    int32_t         blkCnt;
    uint16_t        ratio1x8 = chRatio * 8;
    uint16_t        ratio1x4 = chRatio * 4;
    uint16_t        ratio2x8 = (256 - chRatio) * 8;
    uint16_t        ratio2x4 = (256 - chRatio) * 4;

    uint16x8_t      vecMaskR = vdupq_n_u16(0x001f);
    uint16x8_t      vecMaskG = vdupq_n_u16(0x003f);
    uint16x8_t      vecMaskBpck = vdupq_n_u16(0x00f8);
    uint16x8_t      vecMaskGpck = vdupq_n_u16(0x00fc);

    for (uint32_t y = 0; y < iHeight; y++) {
        /* unconditional blending + predicated dst update */
        const uint16_t *pSource = phwSource;
        uint16_t       *pTarget = phwTarget;

        blkCnt = iWidth >> 3;
        while (blkCnt > 0) {
            uint16x8_t      vecInSrc, vecInDst;
            uint16x8_t      vecR0, vecB0, vecG0;
            uint16x8_t      vecR1, vecB1, vecG1;

            /* unpack 1st stream */
            vecInSrc = vld1q(pSource);
            vecR0 = vandq(vecInSrc, vecMaskR);
            vecB0 = vshrq(vecInSrc, 11);
            vecG0 = vshrq(vecInSrc, 5);
            vecG0 = vandq(vecG0, vecMaskG);

            /* unpack 2nd stream */
            vecInDst = vld1q(pTarget);
            vecR1 = vandq(vecInDst, vecMaskR);
            vecB1 = vshrq(vecInDst, 11);
            vecG1 = vshrq(vecInDst, 5);
            vecG1 = vandq(vecG1, vecMaskG);

            /* merge */
            vecR0 = vmlaq(vmulq(vecR0, ratio1x8), vecR1, ratio2x8);
            vecR0 = vshrq(vecR0, 8);
            vecG0 = vmlaq(vmulq(vecG0, ratio1x4), vecG1, ratio2x4);
            vecG0 = vshrq(vecG0, 8);
            vecB0 = vmlaq(vmulq(vecB0, ratio1x8), vecB1, ratio2x8);
            vecB0 = vshrq(vecB0, 8);

            /* pack */
            uint16x8_t      vOut = vorrq(vshrq(vecR0, 3),
                                         vmulq(vandq(vecG0, vecMaskGpck), 8));
            vOut = vorrq(vOut, vmulq(vandq(vecB0, vecMaskBpck), 256));

            vst1q_p(pTarget, vOut, vcmpneq_n_s16(vecInSrc, hwColour));

            pSource += 8;
            pTarget += 8;
            blkCnt--;
        }

        blkCnt = iWidth & 7;
        if (blkCnt > 0U) {
            uint16x8_t      vecInSrc, vecInDst;
            uint16x8_t      vecR0, vecB0, vecG0;
            uint16x8_t      vecR1, vecB1, vecG1;

            /* unpack 1st stream */
            vecInSrc = vld1q(pSource);
            vecR0 = vandq(vecInSrc, vecMaskR);
            vecB0 = vshrq(vecInSrc, 11);
            vecG0 = vshrq(vecInSrc, 5);
            vecG0 = vandq(vecG0, vecMaskG);

            /* unpack 2nd stream */
            vecInDst = vld1q(pTarget);
            vecR1 = vandq(vecInDst, vecMaskR);
            vecB1 = vshrq(vecInDst, 11);
            vecG1 = vshrq(vecInDst, 5);
            vecG1 = vandq(vecG1, vecMaskG);

            /* merge */
            vecR0 = vmlaq(vmulq(vecR0, ratio1x8), vecR1, ratio2x8);
            vecR0 = vshrq(vecR0, 8);
            vecG0 = vmlaq(vmulq(vecG0, ratio1x4), vecG1, ratio2x4);
            vecG0 = vshrq(vecG0, 8);
            vecB0 = vmlaq(vmulq(vecB0, ratio1x8), vecB1, ratio2x8);
            vecB0 = vshrq(vecB0, 8);

            /* pack */
            uint16x8_t      vOut = vorrq(vshrq(vecR0, 3),
                                         vmulq(vandq(vecG0, vecMaskGpck), 8));
            vOut = vorrq(vOut, vmulq(vandq(vecB0, vecMaskBpck), 256));

            vst1q_p(pTarget, vOut,
                    vcmpneq_m_n_s16(vecInSrc, hwColour, vctp16q(blkCnt)));
        }
        phwSource += iSourceStride;
        phwTarget += iTargetStride;
    }
#else
    uint32_t        iHeight = ptCopySize->iHeight;
    uint32_t        iWidth = ptCopySize->iWidth;

    uint16_t        ratio1x8 = chRatio * 8;
    uint16_t        ratio1x4 = chRatio * 4;
    uint16_t        ratio2x8 = (256 - chRatio) * 8;
    uint16_t        ratio2x4 = (256 - chRatio) * 4;

    uint16x8_t      vecMaskR = vdupq_n_u16(0x001f);
    uint16x8_t      vecMaskG = vdupq_n_u16(0x003f);
    uint16x8_t      vecMaskBpck = vdupq_n_u16(0x00f8);

    uint16x8_t      scratch[1];

    vst1q((uint16_t *) scratch, vdupq_n_u16(0x00fc));

    for (uint32_t y = 0; y < iHeight; y++) {
        const uint16_t *pSource = phwSource;
        uint16_t       *pTarget = phwTarget;
        register unsigned loopCnt __asm("lr");
        loopCnt = iWidth;

        __asm volatile(
            ".p2align 2                                     \n"
            "   vldrh.u16   q4, [%[pTarget]]                \n"
            "   vldrh.u16   q5, [%[pSource]], #16           \n"
            "   vand        q7, q4, %[vecMaskR]             \n"
            "   wlstp.16    lr, %[loopCnt], 1f              \n"
            "2:                                             \n"
            // B target extraction
            "   vshr.u16    q2, q4, #5                      \n"
            "   vmul.i16    q6, q7, %[ratio2x8]             \n"
            // B source extraction
            "   vand        q7, q5, %[vecMaskR]             \n"
            // B mix
            "   vmla.u16    q6, q7, %[ratio1x8]             \n"
            // G extraction
            "   vand        q2, q2, %[vecMaskG]             \n"
            "   vshr.u16    q7, q5, #5                      \n"
            "   vmul.i16    q2, q2, %[ratio2x4]             \n"
            // G extraction
            "   vand        q7, q7, %[vecMaskG]             \n"
            // G mix
            "   vmla.u16    q2, q7, %[ratio1x4]             \n"
            // R extraction
            "   vshr.u16    q4, q4, #11                     \n"
            "   vmul.i16    q7, q4, %[ratio2x8]             \n"
            // R extraction
            "   vshr.u16    q5, q5, #11                     \n"
            // R mix
            "   vmla.u16    q7, q5, %[ratio1x8]             \n"
            "   vshr.u16    q2, q2, #8                      \n"
            "   vldrh.16    q5, [%[scratch]]                \n"
            "   vand        q2, q2, q5                      \n"
            // vmulq((vecG0 & 0x00fc), 8)
            "   vmul.i16    q2, q2, %[eight]                \n"
            "   vshr.u16    q4, q7, #8                      \n"
            // schedule next source load
            "   vldrh.u16   q5, [%[pSource]], #16           \n"
            "   vand        q7, q4, %[vecMaskBpck]          \n"
            // pack R & G
            // vmulq((vecG0 & vecMaskGpck), 8) + vmulq((vecR0 & vecMaskRpck), 256)
            "   vmla.u16    q2, q7, %[twofiftysix]          \n"
            // downshift B ((vecB0 >> 8) >> 3)
            "   vshr.u16    q7, q6, #11                     \n"
            // schedule next target load (pre offset as target not incremented so far)
            "   vldrh.u16   q4, [%[pTarget], #16]           \n"
            // pack blue with R&G
            "   vorr        q2, q2, q7                      \n"
            "   vldrh.u16   q6, [%[pSource], #-32]          \n"
            "   vand        q7, q4, %[vecMaskR]             \n"
            "   vpt.u16     ne, q6, %[hwColour]             \n"
            "   vstrht.16   q2, [%[pTarget]], #16           \n"
            "   letp        lr, 2b                          \n"
            "1:                                             \n"
            : [pSource] "+r"(pSource), [pTarget] "+r"(pTarget), [loopCnt] "+r"(loopCnt)
            : [vecMaskR] "t"(vecMaskR), [vecMaskG] "t"(vecMaskG),
              [vecMaskBpck] "t"(vecMaskBpck),
              [ratio1x8] "r"(ratio1x8), [ratio2x8] "r"(ratio2x8),
              [ratio1x4] "r"(ratio1x4), [ratio2x4] "r"(ratio2x4),
              [eight] "r"(8), [twofiftysix] "r"(256),
              [hwColour] "r"(hwColour), [scratch] "r"(scratch)
            : "q2", "q4", "q5", "q6", "q7", "memory");

        phwSource += (iSourceStride);
        phwTarget += (iTargetStride);
    }
#endif
}

__OVERRIDE_WEAK
void __arm_2d_impl_rgb888_alpha_blending(uint32_t * pwSourceBase,
                                         int16_t iSourceStride,
                                         uint32_t * pwTargetBase,
                                         int16_t iTargetStride,
                                         arm_2d_size_t * ptCopySize,
                                         uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
    uint16_t        chRatioCompl = 256 - (uint16_t) chRatio;
    int32_t         blkCnt;
    int32_t         row = ptCopySize->iHeight;

    while (row > 0) {
        const uint32_t *pwSource = pwSourceBase;
        uint32_t       *pwTarget = pwTargetBase;

        /* byte extraction into 16-bit vector */
        uint16x8_t      vecSrc = vldrbq_u16(pwSource);
        uint16x8_t      vecTrg = vldrbq_u16(pwTarget);

        pwSource += 2;

        blkCnt = ptCopySize->iWidth;
        while (blkCnt > 0) {
            vstrbq_u16(pwTarget,
                       vmlaq(vmulq(vecSrc, chRatio), vecTrg, chRatioCompl) >> 8);
            pwTarget += 2;

            vecSrc = vldrbq_u16(pwSource);
            vecTrg = vldrbq_u16(pwTarget);
            pwSource += 2;

            blkCnt -= 2;
        }
        pwSourceBase += iSourceStride;
        pwTargetBase += iTargetStride;
        row--;
    }
#else
    uint16_t        chRatioCompl = 256 - (uint16_t) chRatio;
    register unsigned blkCnt __asm("lr");
    int32_t         row = ptCopySize->iHeight;

    while (row > 0) {
        blkCnt = ptCopySize->iWidth * 4;
        const uint32_t *pwSource = pwSourceBase;
        uint32_t       *pwTarget = pwTargetBase;

        __asm volatile(
            "   vldrb.u16   q0, [%[pwSource]], #8           \n"
            "   vldrb.u16   q1, [%[pwTarget]]               \n"
            "   wlstp.16    lr, %[loopCnt], 1f              \n"
            "2:                                             \n"
            "   vmul.u16    q2, q0, %[chRatio]              \n"
            "   vldrb.u16   q0, [%[pwSource]], #8           \n"
            "   vmla.u16    q2, q1, %[chRatioCompl]         \n"
            "   vldrb.u16   q1, [%[pwTarget], #8]           \n"
            "   vshr.u16    q2, q2, #8                      \n"
            "   vstrb.16    q2, [%[pwTarget]], #8           \n"
            "   letp        lr, 2b                          \n"
            "1:                                             \n"
            : [pwSource] "+l"(pwSource), [pwTarget] "+l"(pwTarget), [loopCnt] "+r"(blkCnt)
            : [chRatio] "r"(chRatio), [chRatioCompl] "r"(chRatioCompl)
            : "q0", "q1", "q2", "memory");

        pwSourceBase += iSourceStride;
        pwTargetBase += iTargetStride;
        row--;
    }
#endif
}

__OVERRIDE_WEAK
void __arm_2d_impl_rgb888_colour_filling_with_alpha(uint32_t * __RESTRICT pTargetBase,
                                                    int16_t iTargetStride,
                                                    arm_2d_size_t * __RESTRICT ptCopySize,
                                                    uint32_t Colour,
                                                    uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
    uint16_t        chRatioCompl = 256 - (uint16_t) chRatio;
    int32_t         blkCnt;
    int32_t         row = ptCopySize->iHeight;
    uint32_t        scratch[2];
    uint16x8_t      vColor;

    scratch[0] = scratch[1] = Colour;
    vColor = vldrbq_u16((uint8_t *) scratch);
    vColor = vColor * (uint16_t) chRatio;

    while (row > 0) {
        uint32_t       *pTarget = pTargetBase;

        blkCnt = ptCopySize->iWidth;
        while (blkCnt > 0) {
            /* byte extraction into 16-bit vector */
            uint16x8_t      vecTrg = vldrbq_u16((uint8_t *) pTarget);

            vstrbq_u16((uint8_t *) pTarget, vmlaq(vColor, vecTrg, chRatioCompl) >> 8);

            pTarget += 2;
            blkCnt -= 2;
        }
        pTargetBase += iTargetStride;
        row--;
    }
#else /* USE_MVE_INTRINSICS */
    uint16_t        chRatioCompl = 256 - (uint16_t) chRatio;
    int32_t         blkCnt;
    int32_t         row = ptCopySize->iHeight;
    uint32_t        scratch[2];
    uint16x8_t      vColor;

    scratch[0] = scratch[1] = Colour;
    vColor = vldrbq_u16((uint8_t *) scratch);
    vColor = vColor * (uint16_t) chRatio;

    while (row > 0) {
        uint32_t       *pTarget = pTargetBase;
        blkCnt = ptCopySize->iWidth * 4;

        __asm volatile(
            /* preload */
            "   vldrb.u16   q1, [%[pTarget]]                \n"
            "   wlstp.16    lr, %[loopCnt], 1f              \n"
            ".p2align 2                                     \n"
            "2:                                             \n"
            "   vmov        q2, %[vColor]                   \n"
            "   vmla.u16    q2, q1, %[chRatioCompl]         \n"
            "   vldrb.u16   q1, [%[pTarget], #8]            \n"
            "   vshr.u16    q2, q2, #8                      \n"
            "   vstrb.16    q2, [%[pTarget]], #8            \n"
            "   letp        lr, 2b                          \n"
            "1:                                             \n"
            : [pTarget] "+l"(pTarget)
            : [loopCnt] "r"(blkCnt), [chRatioCompl] "r"(chRatioCompl),
              [vColor] "t"(vColor)
            : "q0", "q1", "q2", "memory");

        pTargetBase += iTargetStride;
        row--;
    }
#endif /* USE_MVE_INTRINSICS */
}

__OVERRIDE_WEAK
void __arm_2d_impl_rgb888_alpha_blending_colour_masking(uint32_t * __RESTRICT pSourceBase,
                                                        int16_t iSourceStride,
                                                        uint32_t * __RESTRICT pTargetBase,
                                                        int16_t iTargetStride,
                                                        arm_2d_size_t * __RESTRICT ptCopySize,
                                                        uint_fast8_t chRatio,
                                                        uint32_t Colour)
{
    int32_t         iHeight = ptCopySize->iHeight;
    int32_t         iWidth = ptCopySize->iWidth;
    uint16_t        chRatioCompl = 256 - chRatio;
    uint32_t        scratch[2];
    uint16x8_t      vColor;

    /* color widening */
    scratch[0] = scratch[1] = Colour;
    vColor = vldrbq_u16((uint8_t *) scratch);

    for (int32_t y = 0; y < iHeight; y++) {
        int32_t         blkCnt = iWidth;
        const uint32_t *pSource = pSourceBase;
        uint32_t       *pTarget = pTargetBase;
        uint16x8_t      vecSrc, vecTrg;

        vecSrc = vldrbq_u16((uint8_t const *) pSource);
        pSource += 2;
        vecTrg = vldrbq_u16((uint8_t const *) pTarget);
        pTarget += 2;

        do {
            uint16x8_t      vecOut;

            vecOut = vmlaq(vmulq(vecSrc, chRatio), vecTrg, chRatioCompl) >> 8;

            vecSrc = vldrbq_u16((uint8_t const *) pSource);
            vecTrg = vldrbq_u16((uint8_t const *) pTarget);

            /* update only where the source pixel differs from Colour */
            vstrbq_p_u16((uint8_t *) pTarget, vecOut, vcmpneq_u16(vecSrc, vColor));

            pSource += 2;
            pTarget += 2;
            blkCnt -= 2;
        } while (blkCnt > 0);

        pSourceBase += iSourceStride;
        pTargetBase += iTargetStride;
    }
}
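/*
 * Scalar reference for the 32-bit blends above (illustrative helper only,
 * not part of the Arm-2D API). vldrb.u16 widens 8 consecutive bytes -- i.e.
 * two packed 32-bit pixels -- into eight uint16_t lanes, so a single vector
 * MUL/MLA pair blends every byte (including the 4th, alpha/unused byte) of
 * two pixels at once. Per byte the operation is:
 */
__attribute__((unused))
static uint32_t __rgb888_blend_scalar_ref(uint32_t wSource,
                                          uint32_t wTarget,
                                          uint_fast8_t chRatio)
{
    uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
    uint32_t wResult = 0;

    /* blend the four bytes of the packed pixel independently */
    for (int n = 0; n < 32; n += 8) {
        uint32_t chS = (wSource >> n) & 0xff;
        uint32_t chT = (wTarget >> n) & 0xff;

        wResult |= (((chS * chRatio + chT * chRatioCompl) >> 8) & 0xff) << n;
    }
    return wResult;
}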
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_alpha_blending_direct(const uint16_t * phwSource,
                                                const uint16_t * phwBackground,
                                                uint16_t * phwDestination,
                                                uint32_t wPixelCount,
                                                uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
    int32_t         blkCnt;
    uint16_t        ratio1x8 = chRatio * 8;
    uint16_t        ratio1x4 = chRatio * 4;
    uint16_t        ratio2x8 = (256 - chRatio) * 8;
    uint16_t        ratio2x4 = (256 - chRatio) * 4;

    uint16x8_t      vecMaskR = vdupq_n_u16(0x001f);
    uint16x8_t      vecMaskG = vdupq_n_u16(0x003f);
    uint16x8_t      vecMaskBpck = vdupq_n_u16(0x00f8);
    uint16x8_t      vecMaskGpck = vdupq_n_u16(0x00fc);

    blkCnt = wPixelCount;
    do {
        uint16x8_t      vecIn;
        uint16x8_t      vecR0, vecB0, vecG0;
        uint16x8_t      vecR1, vecB1, vecG1;

        /* unpack 1st stream */
        vecIn = vld1q(phwSource);
        phwSource += 8;
        vecR0 = vecIn & vecMaskR;
        vecB0 = vecIn >> 11;
        vecG0 = vecIn >> 5;
        vecG0 = vecG0 & vecMaskG;

        /* unpack 2nd stream */
        vecIn = vld1q(phwBackground);
        phwBackground += 8;
        vecR1 = vecIn & vecMaskR;
        vecB1 = vecIn >> 11;
        vecG1 = vecIn >> 5;
        vecG1 = vecG1 & vecMaskG;

        /* merge */
        vecR0 = vecR0 * ratio1x8 + vecR1 * ratio2x8;
        vecR0 = vecR0 >> 8;
        vecG0 = vecG0 * ratio1x4 + vecG1 * ratio2x4;
        vecG0 = vecG0 >> 8;
        vecB0 = vecB0 * ratio1x8 + vecB1 * ratio2x8;
        vecB0 = vecB0 >> 8;

        /* pack */
        uint16x8_t      vOut = vecR0 >> 3
            | vmulq((vecG0 & vecMaskGpck), 8)
            | vmulq((vecB0 & vecMaskBpck), 256);

        vst1q(phwDestination, vOut);
        phwDestination += 8;
        blkCnt -= 8;
    } while (blkCnt > 0);
#else /* USE_MVE_INTRINSICS */
    uint16_t        ratio1x8 = chRatio * 8;
    uint16_t        ratio1x4 = chRatio * 4;
    uint16_t        ratio2x8 = (256 - chRatio) * 8;
    uint16_t        ratio2x4 = (256 - chRatio) * 4;

    uint16x8_t      vecMaskR = vdupq_n_u16(0x001f);
    uint16x8_t      vecMaskG = vdupq_n_u16(0x003f);
    uint16x8_t      vecMaskBpck = vdupq_n_u16(0x00f8);

    register unsigned loopCnt __asm("lr") = (wPixelCount);

    __asm volatile(
        "   vldrh.u16   q4, [%[in2]], #16                   \n"
        "   vmov.i16    q6, #0x00fc                         \n"
        "   vstrw.32    q6, [sp]                            \n"
        "   vldrh.u16   q5, [%[in1]], #16                   \n"
        "   wlstp.16    lr, %[loopCnt], 1f                  \n"
        "2:                                                 \n"
        "   vand        q6, q4, %[vecMaskR]                 \n"
        "   vmul.i16    q6, q6, %[ratio2x8]                 \n"
        "   vshr.u16    q2, q4, #5                          \n"
        "   vand        q7, q5, %[vecMaskR]                 \n"
        "   vmla.u16    q6, q7, %[ratio1x8]                 \n"
        "   vand        q2, q2, %[vecMaskG]                 \n"
        "   vshr.u16    q7, q5, #5                          \n"
        "   vmul.i16    q2, q2, %[ratio2x4]                 \n"
        "   vand        q7, q7, %[vecMaskG]                 \n"
        "   vmla.u16    q2, q7, %[ratio1x4]                 \n"
        "   vshr.u16    q4, q4, #11                         \n"
        "   vmul.i16    q7, q4, %[ratio2x8]                 \n"
        "   vshr.u16    q5, q5, #11                         \n"
        "   vshr.u16    q2, q2, #8                          \n"
        "   vmla.u16    q7, q5, %[ratio1x8]                 \n"
        // "   vmov.i16    q6, #0x00fc                      \n"
        "   vshr.u16    q7, q7, #8                          \n"
        // "   vmov.i16    q6, #0x00fc                      \n"
        /* load 0x00fc instead of mov for better overlap opportunity */
        "   vldrw.u32   q4, [sp]                            \n"
        "   vand        q2, q2, q4                          \n"
        "   vmul.i16    q2, q2, %[eight]                    \n"
        "   vand        q4, q7, %[vecMaskBpck]              \n"
        // Q7 = vecB0
        "   vldrh.u16   q5, [%[in1]], #16                   \n"
        "   vmla.u16    q2, q4, %[twofiftysix]              \n"
        // (vecR0 >> 3) >> 8
        "   vshr.u16    q6, q6, #11                         \n"
        "   vldrh.u16   q4, [%[in2]], #16                   \n"
        "   vorr        q2, q2, q6                          \n"
        "   vstrh.16    q2, [%[out]], #16                   \n"
        "   letp        lr, 2b                              \n"
        "1:                                                 \n"
        : [in1] "+r"(phwSource), [in2] "+r"(phwBackground),
          [out] "+r"(phwDestination), [loopCnt] "+r"(loopCnt)
        : [vecMaskR] "t"(vecMaskR), [vecMaskG] "t"(vecMaskG),
          [vecMaskBpck] "t"(vecMaskBpck),
          [ratio1x8] "r"(ratio1x8), [ratio2x8] "r"(ratio2x8),
          [ratio1x4] "r"(ratio1x4), [ratio2x4] "r"(ratio2x4),
          [eight] "r"(8), [twofiftysix] "r"(256)
        : "q2", "q4", "q5", "q6", "q7", "memory");
#endif
}

__OVERRIDE_WEAK
void __arm_2d_impl_rgb888_alpha_blending_direct(const uint32_t * pwSource,
                                                const uint32_t * pwBackground,
                                                uint32_t * pwDestination,
                                                uint32_t wPixelCount,
                                                uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
    int32_t         blkCnt;
    uint16_t        chRatioCompl = 256 - (uint16_t) chRatio;
    uint16x8_t      vecSrc, vecBckg;

    vecSrc = vldrbq_u16((uint8_t const *) pwSource);
    pwSource += 2;
    vecBckg = vldrbq_u16((uint8_t const *) pwBackground);
    pwBackground += 2;

    blkCnt = wPixelCount;
    do {
        uint16x8_t      vecOut;

        vecOut = vmulq_n_u16(vecSrc, (uint16_t) chRatio);
        vecSrc = vldrbq_u16((uint8_t const *) pwSource);
        pwSource += 2;

        vecOut = vmlaq_n_u16(vecOut, vecBckg, chRatioCompl);
        vecBckg = vldrbq_u16((uint8_t const *) pwBackground);
        pwBackground += 2;

        vecOut = vecOut >> 8;

        vstrbq_u16((uint8_t *) pwDestination, vecOut);
        pwDestination += 2;

        blkCnt -= 2;
    } while (blkCnt > 0);
#else /* USE_MVE_INTRINSICS */
    uint16_t        chRatioCompl = 256 - (uint16_t) chRatio;
    register unsigned blkCnt __asm("lr") = (wPixelCount * 4);

    __asm volatile(
        "   vldrb.u16   q0, [%[pwSource]], #8               \n"
        "   vldrb.u16   q1, [%[pwBackg]], #8                \n"
        "   wlstp.16    lr, %[loopCnt], 1f                  \n"
        "2:                                                 \n"
        "   vmul.u16    q2, q0, %[chRatio]                  \n"
        "   vldrb.u16   q0, [%[pwSource]], #8               \n"
        "   vmla.u16    q2, q1, %[chRatioCompl]             \n"
        "   vldrb.u16   q1, [%[pwBackg]], #8                \n"
        "   vshr.u16    q2, q2, #8                          \n"
        "   vstrb.16    q2, [%[pwDest]], #8                 \n"
        "   letp        lr, 2b                              \n"
        "1:                                                 \n"
        : [pwSource] "+l"(pwSource), [pwBackg] "+l"(pwBackground),
          [pwDest] "+l"(pwDestination), [loopCnt] "+r"(blkCnt)
        : [chRatio] "r"(chRatio), [chRatioCompl] "r"(chRatioCompl)
        : "q0", "q1", "q2", "memory");
#endif
}

static
mve_pred16_t arm_2d_is_point_vec_inside_region_s16(const arm_2d_region_t * ptRegion,
                                                   const arm_2d_point_s16x8_t * ptPoint)
{
    mve_pred16_t    p0 = vcmpgeq(ptPoint->X, ptRegion->tLocation.iX);

    p0 = vcmpgeq_m(ptPoint->Y, ptRegion->tLocation.iY, p0);
    p0 = vcmpltq_m(ptPoint->X, ptRegion->tLocation.iX + ptRegion->tSize.iWidth, p0);
    p0 = vcmpltq_m(ptPoint->Y, ptRegion->tLocation.iY + ptRegion->tSize.iHeight, p0);

    return p0;
}

static
mve_pred16_t arm_2d_is_point_vec_inside_region_s32(const arm_2d_region_t * ptRegion,
                                                   const arm_2d_point_s32x4_t * ptPoint)
{
    mve_pred16_t    p0 = vcmpgeq_n_s32(ptPoint->X, ptRegion->tLocation.iX);

    p0 = vcmpgeq_m_n_s32(ptPoint->Y, ptRegion->tLocation.iY, p0);
    p0 = vcmpltq_m_n_s32(ptPoint->X, ptRegion->tLocation.iX + ptRegion->tSize.iWidth, p0);
    p0 = vcmpltq_m_n_s32(ptPoint->Y, ptRegion->tLocation.iY + ptRegion->tSize.iHeight, p0);

    return p0;
}
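/*
 * Scalar equivalent of the two vector predicates above (illustrative helper
 * only, not part of the Arm-2D API): each lane of the returned mve_pred16_t
 * holds the result of this point-in-region test for the corresponding
 * (X, Y) pair.
 */
__attribute__((unused))
static bool __arm_2d_is_point_inside_region_ref(const arm_2d_region_t * ptRegion,
                                                int16_t iX, int16_t iY)
{
    return (iX >= ptRegion->tLocation.iX)
        && (iY >= ptRegion->tLocation.iY)
        && (iX < (ptRegion->tLocation.iX + ptRegion->tSize.iWidth))
        && (iY < (ptRegion->tLocation.iY + ptRegion->tSize.iHeight));
}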
static
void __arm_2d_impl_rgb565_get_pixel_colour(arm_2d_point_s16x8_t * ptPoint,
                                           arm_2d_region_t * ptOrigValidRegion,
                                           uint16_t * pOrigin,
                                           int16_t iOrigStride,
                                           uint16_t * pTarget,
                                           uint16_t MaskColour,
                                           uint32_t elts)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
#error "The current version doesn't support interpolation in rotation yet."
#else
    /* set vector predicate if point is inside the region */
    mve_pred16_t    p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, ptPoint);
    mve_pred16_t    predTail = vctp16q(elts);

    /* prepare vector of point offsets */
    uint16x8_t      ptOffs = ptPoint->X + ptPoint->Y * iOrigStride;
    uint16x8_t      vPixel = vld1q(pTarget);

    /* retrieve all point values */
    uint16x8_t      ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);

    /* combine 2 predicates set to true if point is in the region & values different from color mask */
    vPixel = vpselq_u16(ptVal, vPixel, vcmpneq_m_n_u16(ptVal, MaskColour, p));

    vst1q_p(pTarget, vPixel, predTail);
#endif
}

static
void __arm_2d_impl_rgb565_get_pixel_colour_offs_compensated(arm_2d_point_s16x8_t * ptPoint,
                                                            arm_2d_region_t * ptOrigValidRegion,
                                                            uint16_t * pOrigin,
                                                            int16_t iOrigStride,
                                                            uint16_t * pTarget,
                                                            uint16_t MaskColour,
                                                            uint32_t elts,
                                                            int16_t correctionOffset)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
#error "The current version doesn't support interpolation in rotation yet."
#else
    /* set vector predicate if point is inside the region */
    mve_pred16_t    p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, ptPoint);
    mve_pred16_t    predTail = vctp16q(elts);

    /* prepare vector of point offsets */
    /* correctionOffset avoid 16-bit overflow */
    uint16x8_t      ptOffs = ptPoint->X + (ptPoint->Y - correctionOffset) * iOrigStride;

    /* base pointer update to compensate offset */
    pOrigin += (correctionOffset * iOrigStride);

    uint16x8_t      vPixel = vld1q(pTarget);

    /* retrieve all point values */
    uint16x8_t      ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);

    /* combine 2 predicates set to true if point is in the region & values different from color mask */
    vPixel = vpselq_u16(ptVal, vPixel, vcmpneq_m_n_u16(ptVal, MaskColour, p));

    vst1q_p(pTarget, vPixel, predTail);
#endif
}

static
void __arm_2d_impl_rgb888_get_pixel_colour_mve(arm_2d_point_s16x8_t * ptPoint,
                                               arm_2d_region_t * ptOrigValidRegion,
                                               uint32_t * pOrigin,
                                               int16_t iOrigStride,
                                               uint32_t * pTarget,
                                               uint32_t MaskColour,
                                               int16_t elts)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
#error "The current version doesn't support interpolation in rotation yet."
#else
    arm_2d_point_s32x4_t tPointLo, tPointHi;
    ARM_ALIGN(8) int16_t scratch[8];
    mve_pred16_t    p;

    /* split 16-bit point vector into 2 x 32-bit vectors */
    vst1q(scratch, ptPoint->X);
    tPointLo.X = vldrhq_s32(scratch);
    tPointHi.X = vldrhq_s32(scratch + 4);

    vst1q(scratch, ptPoint->Y);
    tPointLo.Y = vldrhq_s32(scratch);
    tPointHi.Y = vldrhq_s32(scratch + 4);

    /* 1st half */

    /* set vector predicate if point is inside the region */
    p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);

    /* prepare vector of point offsets */
    uint32x4_t      ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
    uint32x4_t      vPixel = vld1q(pTarget);

    /* retrieve all point values */
    uint32x4_t      ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);

    /* combine 2 predicates set to true if point is in the region & values different from color mask */
    vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));

    vst1q_p(pTarget, vPixel, vctp32q(elts));

    elts -= 4;
    if (elts > 0) {
        /* second half */
        p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
        ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
        vPixel = vld1q(pTarget + 4);

        ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
        vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));

        vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
    }
#endif
}

static
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha(arm_2d_point_s16x8_t * ptPoint,
                                                      arm_2d_region_t * ptOrigValidRegion,
                                                      uint16_t * pOrigin,
                                                      int16_t iOrigStride,
                                                      uint16_t * pTarget,
                                                      uint16_t MaskColour,
                                                      uint8_t chOpacity,
                                                      uint32_t elts)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
#error "The current version doesn't support interpolation in rotation yet."
#else
    /* set vector predicate if point is inside the region */
    mve_pred16_t    p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, ptPoint);

    /* prepare vector of point offsets */
    uint16x8_t      ptOffs = ptPoint->X + ptPoint->Y * iOrigStride;
    uint16x8_t      vPixel = vld1q(pTarget);

    /* retrieve all point values */
    uint16x8_t      ptVal = vldrhq_gather_shifted_offset_u16(pOrigin, ptOffs);

    /* alpha blending */
    uint16x8_t      vBlended = __rgb565_alpha_blending_single_vec(ptVal, vPixel, chOpacity);

    /* combine 2 predicates, set to true, if point is in the region & values different from color mask */
    vPixel = vpselq_u16(vBlended, vPixel, vcmpneq_m_n_u16(ptVal, MaskColour, p));

    vst1q_p(pTarget, vPixel, vctp16q(elts));
#endif
}

static
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha_offs_compensated(
                                                      arm_2d_point_s16x8_t * ptPoint,
                                                      arm_2d_region_t * ptOrigValidRegion,
                                                      uint16_t * pOrigin,
                                                      int16_t iOrigStride,
                                                      uint16_t * pTarget,
                                                      uint16_t MaskColour,
                                                      uint8_t chOpacity,
                                                      uint32_t elts,
                                                      int16_t correctionOffset)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
#error "The current version doesn't support interpolation in rotation yet."
#else
    /* set vector predicate if point is inside the region */
    mve_pred16_t    p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, ptPoint);

    /* prepare vector of point offsets */
    /* correctionOffset avoid 16-bit overflow */
    uint16x8_t      ptOffs = ptPoint->X + (ptPoint->Y - correctionOffset) * iOrigStride;
    mve_pred16_t    predTail = vctp16q(elts);
    uint16x8_t      vPixel = vld1q(pTarget);

    /* retrieve all point values */
    /* base pointer update to compensate offset */
    pOrigin += (correctionOffset * iOrigStride);

    uint16x8_t      ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);

    /* alpha blending */
    uint16x8_t      vBlended = __rgb565_alpha_blending_single_vec(ptVal, vPixel, chOpacity);

    /* combine 2 predicates, set to true, if point is in the region & values different from color mask */
    vPixel = vpselq_u16(vBlended, vPixel, vcmpneq_m_n_u16(ptVal, MaskColour, p));

    vst1q_p(pTarget, vPixel, predTail);
#endif
}

static
void __arm_2d_impl_rgb888_get_pixel_colour_with_alpha_mve(arm_2d_point_s16x8_t * ptPoint,
                                                          arm_2d_region_t * ptOrigValidRegion,
                                                          uint32_t * pOrigin,
                                                          int16_t iOrigStride,
                                                          uint32_t * pTarget,
                                                          uint32_t MaskColour,
                                                          uint8_t chOpacity,
                                                          int16_t elts)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
#error "The current version doesn't support interpolation in rotation yet."
#else
    arm_2d_point_s32x4_t tPointLo, tPointHi;
    ARM_ALIGN(8) int16_t scratch[8];
    ARM_ALIGN(8) uint32_t blendled[4];
    mve_pred16_t    p;

    /* split 16-bit point vector into 2 x 32-bit vectors */
    vst1q(scratch, ptPoint->X);
    tPointLo.X = vldrhq_s32(scratch);
    tPointHi.X = vldrhq_s32(scratch + 4);

    vst1q(scratch, ptPoint->Y);
    tPointLo.Y = vldrhq_s32(scratch);
    tPointHi.Y = vldrhq_s32(scratch + 4);

    /* 1st half */

    /* set vector predicate if point is inside the region */
    p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);

    /* prepare vector of point offsets */
    uint32x4_t      ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
    uint32x4_t      vPixel = vld1q(pTarget);

    /* retrieve all point values */
    uint32x4_t      ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);

    vstrwq_u32((uint32_t *) scratch, ptVal);

    /* alpha-blending (requires widened inputs) */
    vstrbq_u16((uint8_t *) blendled,
               __rgb888_alpha_blending_direct_single_vec(
                   vldrbq_u16((uint8_t const *) scratch),
                   vldrbq_u16((uint8_t const *) pTarget), chOpacity));
    vstrbq_u16((uint8_t *) (blendled + 2),
               __rgb888_alpha_blending_direct_single_vec(
                   vldrbq_u16((uint8_t const *) (scratch + 4)),
                   vldrbq_u16((uint8_t const *) (pTarget + 2)), chOpacity));

    uint32x4_t      vBlended = vld1q(blendled);

    /* combine 2 predicates, set to true, if point is in the region & values different from color mask */
    vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));

    vst1q_p(pTarget, vPixel, vctp32q(elts));

    elts -= 4;
    if (elts > 0) {
        /* second half */
        p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
        ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
        vPixel = vld1q(pTarget + 4);

        ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);

        vstrwq_u32((uint32_t *) scratch, ptVal);

        /* alpha-blending (requires widened inputs) */
        vstrbq_u16((uint8_t *) blendled,
                   __rgb888_alpha_blending_direct_single_vec(
                       vldrbq_u16((uint8_t const *) scratch),
                       vldrbq_u16((uint8_t const *) (pTarget + 4)), chOpacity));
        vstrbq_u16((uint8_t *) (blendled + 2),
                   __rgb888_alpha_blending_direct_single_vec(
                       vldrbq_u16((uint8_t const *) (scratch + 4)),
                       vldrbq_u16((uint8_t const *) (pTarget + 6)), chOpacity));

        vBlended = vld1q(blendled);
        /* combine 2 predicates, set to true, if point is in the region & values different from color mask */
        vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));

        vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
    }
#endif
}

#if __ARM_2D_HAS_HELIUM_FLOAT__                                             \
    && !__ARM_2D_CFG_FORCED_FIXED_POINT_ROTATION__

static
bool __arm_2d_rotate_regression(arm_2d_size_t * __RESTRICT ptCopySize,
                                arm_2d_location_t * pSrcPoint,
                                float fAngle,
                                arm_2d_location_t * tOffset,
                                arm_2d_location_t * center,
                                int32_t iOrigStride,
                                arm_2d_rot_linear_regr_t regrCoefs[])
{
    int32_t         iHeight = ptCopySize->iHeight;
    int32_t         iWidth = ptCopySize->iWidth;
    float           invHeight = 1.0f / (float) (iHeight - 1);
    arm_2d_point_s32x4_t vPointCornerI;
    int32x4_t       vCornerX = { 0, 1, 0, 1 };
    int32x4_t       vCornerY = { 0, 0, 1, 1 };
    float           cosAngle = arm_cos_f32(fAngle);
    float           sinAngle = arm_sin_f32(fAngle);
    arm_2d_point_float_t centerf;
    float           slopeX, slopeY;
    bool            gatherLoadIdxOverflow = 0;

    centerf.fX = (float) center->iX;
    centerf.fY = (float) center->iY;

    vPointCornerI.X = vdupq_n_s32(pSrcPoint->iX + tOffset->iX);
    vPointCornerI.X = vPointCornerI.X + vmulq_n_s32(vCornerX, (iWidth - 1));

    vPointCornerI.Y = vdupq_n_s32(pSrcPoint->iY + tOffset->iY);
    vPointCornerI.Y = vPointCornerI.Y + vmulq_n_s32(vCornerY, (iHeight - 1));

    /*
     * Vector version of:
     *
     *   int16_t iX = ptLocation->iX - ptCenter->iX;
     *   int16_t iY = ptLocation->iY - ptCenter->iY;
     *
     *   float cosAngle = arm_cos_f32(fAngle);
     *   float sinAngle = arm_sin_f32(fAngle);
     *
     *   ptOutBuffer->fY = (iY * cosAngle + iX * sinAngle + ptCenter->iY);
     *   ptOutBuffer->fX = (-iY * sinAngle + iX * cosAngle + ptCenter->iX);
     */
    arm_2d_point_f32x4_t vTmp, vPointCornerF;

    vTmp.X = vsubq_n_f32(vcvtq_f32_s32(vPointCornerI.X), centerf.fX);
    vTmp.Y = vsubq_n_f32(vcvtq_f32_s32(vPointCornerI.Y), centerf.fY);

    vPointCornerF.X = vmulq_n_f32(vTmp.X, cosAngle) - vmulq_n_f32(vTmp.Y, sinAngle);
    vPointCornerF.X = vaddq_n_f32(vPointCornerF.X, centerf.fX);

    vPointCornerF.Y = vmulq_n_f32(vTmp.X, sinAngle) + vmulq_n_f32(vTmp.Y, cosAngle);
    vPointCornerF.Y = vaddq_n_f32(vPointCornerF.Y, centerf.fY);

    /*
     * Check whether rotated index offsets could exceed 16-bit limits
     * used in subsequent gather loads
     * This will occur for parts of large images (e.g. 320*200)
     * To avoid unconditional penalties for small/medium images,
     * returns a speculative overflow allowing to handle large offsets.
     */
    float32_t       maxY = vmaxnmvq(0.0f, vPointCornerF.Y);

    if ((iOrigStride * maxY) > (float) (UINT16_MAX))
        gatherLoadIdxOverflow = true;

    /* interpolation in Y direction for 1st elements column */
    slopeX = (vPointCornerF.X[2] - vPointCornerF.X[0]) * invHeight;
    slopeY = (vPointCornerF.Y[2] - vPointCornerF.Y[0]) * invHeight;

    regrCoefs[0].slopeY = slopeY;
    regrCoefs[0].slopeX = slopeX;
    regrCoefs[0].interceptY = vPointCornerF.Y[0];
    regrCoefs[0].interceptX = vPointCornerF.X[0];

    /* interpolation in Y direction for the last elements column */
    slopeX = (vPointCornerF.X[3] - vPointCornerF.X[1]) * invHeight;
    slopeY = (vPointCornerF.Y[3] - vPointCornerF.Y[1]) * invHeight;

    regrCoefs[1].slopeY = slopeY;
    regrCoefs[1].slopeX = slopeX;
    regrCoefs[1].interceptY = vPointCornerF.Y[1];
    regrCoefs[1].interceptX = vPointCornerF.X[1];

    return gatherLoadIdxOverflow;
}
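/*
 * Geometry recap for the regression above: with theta the rotation angle
 * and (xc, yc) the rotation centre, every corner (x, y) maps to
 *
 *     X' = (x - xc) * cos(theta) - (y - yc) * sin(theta) + xc
 *     Y' = (x - xc) * sin(theta) + (y - yc) * cos(theta) + yc
 *
 * Because this mapping is affine, source coordinates vary linearly along
 * any row or column of the destination. The four rotated corners are
 * therefore enough to derive, for each edge column, a slope and an
 * intercept (regrCoefs[0] for the first column, regrCoefs[1] for the
 * last); the rotation kernels below then interpolate linearly between the
 * two columns to obtain each pixel's source coordinates instead of
 * recomputing the rotation per pixel.
 */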
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_rotate(__arm_2d_param_copy_orig_t * ptParam,
                                 __arm_2d_rotate_info_t * ptInfo)
{
    int32_t         iHeight = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iHeight;
    int32_t         iWidth = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iWidth;
    int32_t         iTargetStride = ptParam->use_as____arm_2d_param_copy_t.tTarget.iStride;
    uint16_t       *pTargetBase = ptParam->use_as____arm_2d_param_copy_t.tTarget.pBuffer;
    uint16_t       *pOrigin = ptParam->tOrigin.pBuffer;
    int32_t         iOrigStride = ptParam->tOrigin.iStride;
    uint16_t        MaskColour = ptInfo->Mask.hwColour;
    float           fAngle = -ptInfo->fAngle;
    arm_2d_location_t tOffset =
        ptParam->use_as____arm_2d_param_copy_t.tSource.tValidRegion.tLocation;
    arm_2d_location_t *pCenter = &(ptInfo->tCenter);

    float           invIWidth = 1.0f / (float) (iWidth - 1);
    arm_2d_rot_linear_regr_t regrCoefs[2];
    arm_2d_location_t SrcPt = ptInfo->tDummySourceOffset;
    bool            gatherLoadIdxOverflow;

    /* get regression parameters over 1st and last column */
    gatherLoadIdxOverflow =
        __arm_2d_rotate_regression(&ptParam->use_as____arm_2d_param_copy_t.tCopySize,
                                   &SrcPt, fAngle, &tOffset, pCenter, iOrigStride,
                                   regrCoefs);

    /* slopes between 1st and last columns */
    float16_t       slopeY, slopeX;

    slopeY = (float16_t) (regrCoefs[1].interceptY - regrCoefs[0].interceptY) * invIWidth;
    slopeX = (float16_t) (regrCoefs[1].interceptX - regrCoefs[0].interceptX) * invIWidth;

    if (!gatherLoadIdxOverflow) {
        for (int32_t y = 0; y < iHeight; y++) {
            /* 1st column estimates (intercepts for regression in X direction) */
            float16_t       colFirstY = regrCoefs[0].slopeY * y + regrCoefs[0].interceptY;
            float16_t       colFirstX = regrCoefs[0].slopeX * y + regrCoefs[0].interceptX;
            int32_t         nbVecElts = iWidth;
            float16x8_t     vX = vcvtq_f16_s16((int16x8_t) vidupq_n_u16(0, 1));
            uint16_t       *pTargetBaseCur = pTargetBase;

            while (nbVecElts > 0) {
                arm_2d_point_s16x8_t tPointV;

                tPointV.X = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstX), vX, slopeX));
                tPointV.Y = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstY), vX, slopeY));

                __arm_2d_impl_rgb565_get_pixel_colour(&tPointV,
                                                      &ptParam->tOrigin.tValidRegion,
                                                      pOrigin, iOrigStride,
                                                      pTargetBaseCur, MaskColour,
                                                      nbVecElts);
                pTargetBaseCur += 8;
                vX += 8.0f16;
                nbVecElts -= 8;
            }
            pTargetBase += iTargetStride;
        }
    } else {
        for (int32_t y = 0; y < iHeight; y++) {
            /* 1st column estimates (intercepts for regression in X direction) */
            float16_t       colFirstY = regrCoefs[0].slopeY * y + regrCoefs[0].interceptY;
            float16_t       colFirstX = regrCoefs[0].slopeX * y + regrCoefs[0].interceptX;
            int32_t         nbVecElts = iWidth;
            float16x8_t     vX = vcvtq_f16_s16((int16x8_t) vidupq_n_u16(0, 1));
            uint16_t       *pTargetBaseCur = pTargetBase;

            while (nbVecElts > 0) {
                arm_2d_point_s16x8_t tPointV;
                tPointV.X = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstX), vX, slopeX));
                tPointV.Y = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstY), vX, slopeY));

                /* get Y minimum, subtract 1 to compensate negative X,
                   as gather load index cannot be negative */
                int16_t         correctionOffset = vminvq_s16(0x7fff, tPointV.Y) - 1;

                __arm_2d_impl_rgb565_get_pixel_colour_offs_compensated(&tPointV,
                                                       &ptParam->tOrigin.tValidRegion,
                                                       pOrigin, iOrigStride,
                                                       pTargetBaseCur, MaskColour,
                                                       nbVecElts, correctionOffset);
                pTargetBaseCur += 8;
                vX += 8.0f16;
                nbVecElts -= 8;
            }
            pTargetBase += iTargetStride;
        }
    }
}

/* untested */
__OVERRIDE_WEAK
void __arm_2d_impl_rgb888_rotate(__arm_2d_param_copy_orig_t * ptParam,
                                 __arm_2d_rotate_info_t * ptInfo)
{
    int32_t         iHeight = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iHeight;
    int32_t         iWidth = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iWidth;
    int32_t         iTargetStride = ptParam->use_as____arm_2d_param_copy_t.tTarget.iStride;
    uint32_t       *pTargetBase = ptParam->use_as____arm_2d_param_copy_t.tTarget.pBuffer;
    uint32_t       *pOrigin = ptParam->tOrigin.pBuffer;
    int32_t         iOrigStride = ptParam->tOrigin.iStride;
    uint32_t        MaskColour = ptInfo->Mask.hwColour;
    float           fAngle = -ptInfo->fAngle;
    arm_2d_location_t tOffset =
        ptParam->use_as____arm_2d_param_copy_t.tSource.tValidRegion.tLocation;
    arm_2d_location_t *pCenter = &(ptInfo->tCenter);

    float           invIWidth = 1.0f / (float) (iWidth - 1);
    arm_2d_rot_linear_regr_t regrCoefs[2];
    arm_2d_location_t SrcPt = ptInfo->tDummySourceOffset;

    /* get regression parameters over 1st and last column */
    __arm_2d_rotate_regression(&ptParam->use_as____arm_2d_param_copy_t.tCopySize,
                               &SrcPt, fAngle, &tOffset, pCenter, iOrigStride,
                               regrCoefs);

    /* slopes between 1st and last columns */
    float16_t       slopeY, slopeX;

    slopeY = (float16_t) (regrCoefs[1].interceptY - regrCoefs[0].interceptY) * invIWidth;
    slopeX = (float16_t) (regrCoefs[1].interceptX - regrCoefs[0].interceptX) * invIWidth;

    for (int32_t y = 0; y < iHeight; y++) {
        /* 1st column estimates (intercepts for regression in X direction) */
        float16_t       colFirstY = regrCoefs[0].slopeY * y + regrCoefs[0].interceptY;
        float16_t       colFirstX = regrCoefs[0].slopeX * y + regrCoefs[0].interceptX;
        int32_t         nbVecElts = iWidth;
        float16x8_t     vX = vcvtq_f16_s16((int16x8_t) vidupq_n_u16(0, 1));
        uint32_t       *pTargetBaseCur = pTargetBase;

        while (nbVecElts > 0) {
            arm_2d_point_s16x8_t tPointV;

            tPointV.X = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstX), vX, slopeX));
            tPointV.Y = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstY), vX, slopeY));

            __arm_2d_impl_rgb888_get_pixel_colour_mve(&tPointV,
                                                      &ptParam->tOrigin.tValidRegion,
                                                      pOrigin, iOrigStride,
                                                      pTargetBaseCur, MaskColour,
                                                      nbVecElts);
            pTargetBaseCur += 8;
            vX += 8.0f16;
            nbVecElts -= 8;
        }
        pTargetBase += iTargetStride;
    }
}

__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_rotate_alpha(__arm_2d_param_copy_orig_t * ptParam,
                                       __arm_2d_rotate_info_t * ptInfo,
                                       uint_fast8_t chRatio)
{
    int32_t         iHeight = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iHeight;
    int32_t         iWidth = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iWidth;
    int32_t         iTargetStride = ptParam->use_as____arm_2d_param_copy_t.tTarget.iStride;
    uint16_t       *pTargetBase = ptParam->use_as____arm_2d_param_copy_t.tTarget.pBuffer;
    uint16_t       *pOrigin = ptParam->tOrigin.pBuffer;
    int32_t         iOrigStride = ptParam->tOrigin.iStride;
    uint16_t        MaskColour = ptInfo->Mask.hwColour;
    float           fAngle = -ptInfo->fAngle;
    arm_2d_location_t tOffset =
        ptParam->use_as____arm_2d_param_copy_t.tSource.tValidRegion.tLocation;
    arm_2d_location_t *pCenter = &(ptInfo->tCenter);
    uint16_t        hwRatioCompl = 256 - chRatio;
    float           invIWidth = 1.0f / (float) (iWidth - 1);
    arm_2d_rot_linear_regr_t regrCoefs[2];
    arm_2d_location_t SrcPt = ptInfo->tDummySourceOffset;
    bool            gatherLoadIdxOverflow;

    /* get regression parameters over 1st and last column */
    gatherLoadIdxOverflow =
        __arm_2d_rotate_regression(&ptParam->use_as____arm_2d_param_copy_t.tCopySize,
                                   &SrcPt, fAngle, &tOffset, pCenter, iOrigStride,
                                   regrCoefs);

    /* slopes between 1st and last columns */
    float16_t       slopeY, slopeX;

    slopeY = (float16_t) (regrCoefs[1].interceptY - regrCoefs[0].interceptY) * invIWidth;
    slopeX = (float16_t) (regrCoefs[1].interceptX - regrCoefs[0].interceptX) * invIWidth;

    if (!gatherLoadIdxOverflow) {
        for (int32_t y = 0; y < iHeight; y++) {
            /* 1st column estimates (intercepts for regression in X direction) */
            float16_t       colFirstY =
                (float16_t) (regrCoefs[0].slopeY * y + regrCoefs[0].interceptY);
            float16_t       colFirstX =
                (float16_t) (regrCoefs[0].slopeX * y + regrCoefs[0].interceptX);
            int32_t         nbVecElts = iWidth;
            float16x8_t     vX = vcvtq_f16_s16((int16x8_t) vidupq_n_u16(0, 1));
            uint16_t       *pTargetBaseCur = pTargetBase;

            while (nbVecElts > 0) {
                arm_2d_point_s16x8_t tPointV;

                /* linear interpolation thru first & last columns */
                tPointV.X = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstX), vX, slopeX));
                tPointV.Y = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstY), vX, slopeY));

                __arm_2d_impl_rgb565_get_pixel_colour_with_alpha(&tPointV,
                                                    &ptParam->tOrigin.tValidRegion,
                                                    pOrigin, iOrigStride,
                                                    pTargetBaseCur, MaskColour,
                                                    hwRatioCompl, nbVecElts);
                pTargetBaseCur += 8;
                vX += 8.0f16;
                nbVecElts -= 8;
            }
            pTargetBase += iTargetStride;
        }
    } else {
        /*
         * Large image / Large origin offsets
         * Gather load 16-bit could overflow
         *  - Y offset needs to be shifted down to avoid overflow
         *  - 16-bit gather loads base address is incremented
         *
         * Needs to be done in the inner loop.
         * In the case of steep slopes, taking the minimum between the Y extrema
         * could still generate overflows
         */
        for (int32_t y = 0; y < iHeight; y++) {
            /* 1st column estimates (intercepts for regression in X direction) */
            float16_t       colFirstY =
                (float16_t) (regrCoefs[0].slopeY * y + regrCoefs[0].interceptY);
            float16_t       colFirstX =
                (float16_t) (regrCoefs[0].slopeX * y + regrCoefs[0].interceptX);
            int32_t         nbVecElts = iWidth;
            float16x8_t     vX = vcvtq_f16_s16((int16x8_t) vidupq_n_u16(0, 1));
            uint16_t       *pTargetBaseCur = pTargetBase;

            while (nbVecElts > 0) {
                arm_2d_point_s16x8_t tPointV;

                /* linear interpolation thru first & last columns */
                tPointV.X = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstX), vX, slopeX));
                tPointV.Y = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstY), vX, slopeY));

                /* get Y minimum, subtract 1 to compensate negative X,
                   as gather load index cannot be negative */
                int16_t         correctionOffset = vminvq_s16(0x7fff, tPointV.Y) - 1;

                __arm_2d_impl_rgb565_get_pixel_colour_with_alpha_offs_compensated(
                                                    &tPointV,
                                                    &ptParam->tOrigin.tValidRegion,
                                                    pOrigin, iOrigStride,
                                                    pTargetBaseCur, MaskColour,
                                                    hwRatioCompl, nbVecElts,
                                                    correctionOffset);
                pTargetBaseCur += 8;
                vX += 8.0f16;
                nbVecElts -= 8;
            }
            pTargetBase += iTargetStride;
        }
    }
}
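/*
 * Worked example of the overflow case handled above: gather-load offsets
 * are 16-bit, so the largest reachable linear offset is 65535. With an
 * origin stride of 320 pixels, a source row index of 205 already yields
 * 205 * 320 = 65600 > 65535. Subtracting correctionOffset (the per-vector
 * minimum Y, minus 1) from the Y coordinates and adding
 * correctionOffset * iOrigStride to the base pointer keeps the per-lane
 * offsets in range while addressing the same pixels.
 */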
tValidRegion, pOrigin, iOrigStride,
                        pTargetBaseCur, MaskColour, hwRatioCompl, nbVecElts, correctionOffset);

                pTargetBaseCur += 8;
                vX += 8.0f16;
                nbVecElts -= 8;
            }
            pTargetBase += iTargetStride;
        }
    }
}

/* untested */
__OVERRIDE_WEAK
void __arm_2d_impl_rgb888_rotate_alpha(__arm_2d_param_copy_orig_t *ptParam,
                                       __arm_2d_rotate_info_t *ptInfo,
                                       uint_fast8_t chRatio)
{
    int32_t iHeight = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iHeight;
    int32_t iWidth = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iWidth;
    int32_t iTargetStride = ptParam->use_as____arm_2d_param_copy_t.tTarget.iStride;
    uint32_t *pTargetBase = ptParam->use_as____arm_2d_param_copy_t.tTarget.pBuffer;
    uint32_t *pOrigin = ptParam->tOrigin.pBuffer;
    int32_t iOrigStride = ptParam->tOrigin.iStride;
    uint32_t MaskColour = ptInfo->Mask.hwColour;
    float fAngle = -ptInfo->fAngle;
    arm_2d_location_t tOffset = ptParam->use_as____arm_2d_param_copy_t.tSource.tValidRegion.tLocation;
    uint16_t wRatioCompl = 256 - chRatio;
    arm_2d_location_t *pCenter = &(ptInfo->tCenter);
    float invIWidth = 1.0f / (float) (iWidth - 1);
    arm_2d_rot_linear_regr_t regrCoefs[2];
    arm_2d_location_t SrcPt = ptInfo->tDummySourceOffset;

    /* get regression parameters over 1st and last column */
    __arm_2d_rotate_regression(&ptParam->use_as____arm_2d_param_copy_t.tCopySize,
                               &SrcPt, fAngle, &tOffset, pCenter, iOrigStride, regrCoefs);

    /* slopes between 1st and last columns */
    float16_t slopeY, slopeX;
    slopeY = (float16_t)(regrCoefs[1].interceptY - regrCoefs[0].interceptY) * invIWidth;
    slopeX = (float16_t)(regrCoefs[1].interceptX - regrCoefs[0].interceptX) * invIWidth;

    for (int32_t y = 0; y < iHeight; y++) {
        /* 1st column estimates (intercepts for regression in X direction) */
        float16_t colFirstY = regrCoefs[0].slopeY * y + regrCoefs[0].interceptY;
        float16_t colFirstX = regrCoefs[0].slopeX * y + regrCoefs[0].interceptX;
        int32_t nbVecElts = iWidth;
        float16x8_t vX = vcvtq_f16_s16((int16x8_t) vidupq_n_u16(0, 1));
        uint32_t *pTargetBaseCur = pTargetBase;

        while (nbVecElts > 0) {
            arm_2d_point_s16x8_t tPointV;

            tPointV.X = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstX), vX, slopeX));
            tPointV.Y = vcvtq_s16_f16(vfmaq_n_f16(vdupq_n_f16(colFirstY), vX, slopeY));

            /* write through the per-row cursor, not the row base pointer */
            __arm_2d_impl_rgb888_get_pixel_colour_with_alpha_mve(&tPointV,
                    &ptParam->tOrigin.tValidRegion, pOrigin, iOrigStride,
                    pTargetBaseCur, MaskColour, wRatioCompl, nbVecElts);

            pTargetBaseCur += 8;
            vX += 8.0f16;
            nbVecElts -= 8;
        }
        pTargetBase += iTargetStride;
    }
}

#else /* __ARM_2D_HAS_HELIUM_FLOAT__ && ! __ARM_2D_CFG_FORCED_FIXED_POINT_ROTATION__ */

#define ONE_BY_2PI_Q31      341782637.0f
#define ARSHIFT(x, shift) (shift > 0 ?
x >> shift : x << (-shift)) #define TO_Q16(x) ((x) << 16) #ifdef VECTORIZED_ROTATION_REGR /* disabled as slower than scalar */ static bool __arm_2d_rotate_regression(arm_2d_size_t * __RESTRICT ptCopySize, arm_2d_location_t * pSrcPoint, float fAngle, arm_2d_location_t * tOffset, arm_2d_location_t * center, int32_t iOrigStride, arm_2d_rot_linear_regr_t regrCoefs[] ) { int32_t iHeight = ptCopySize->iHeight; int32_t iWidth = ptCopySize->iWidth; q31_t invHeightFx = 0x7fffffff / (iHeight - 1); arm_2d_point_s32x4_t vPointCornerI; int32_t AngleFx = (int32_t) roundf(fAngle * ONE_BY_2PI_Q31); q31_t cosAngleFx = arm_cos_q31(AngleFx); q31_t sinAngleFx = arm_sin_q31(AngleFx); int32x4_t vCornerX = { 0, 1, 0, 1 }; int32x4_t vCornerY = { 0, 0, 1, 1 }; bool gatherLoadIdxOverflow = 0; vPointCornerI.X = vdupq_n_s32(pSrcPoint->iX + tOffset->iX); vPointCornerI.X = vPointCornerI.X + vmulq_n_s32(vCornerX, (iWidth - 1)); vPointCornerI.Y = vdupq_n_s32(pSrcPoint->iY + tOffset->iY); vPointCornerI.Y = vPointCornerI.Y + vmulq_n_s32(vCornerY, (iHeight - 1)); /* Vector version of: int16_t iX = ptLocation->iX - ptCenter->iX; int16_t iY = ptLocation->iY - ptCenter->iY; q31_t cosAngleFx = arm_cos_q31(fAngle); q31_t sinAngleFx = arm_sin_q31(fAngle); tPointCornerFx[0][0].Y = qdadd(qdadd(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx)); tPointCornerFx[0][0].X = qdsub(qdadd(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx)); */ arm_2d_point_s32x4_t vTmp1; vTmp1.X = vsubq_n_s16(vPointCornerI.X, center->iX); vTmp1.Y = vsubq_n_s16(vPointCornerI.Y, center->iY); vTmp1.X <<= 16; vTmp1.Y <<= 16; vPointCornerI.X = vqsubq(vqdmulhq_n_s32(vTmp1.X, cosAngleFx), vqdmulhq_n_s32(vTmp1.Y, sinAngleFx)); vPointCornerI.X = vqaddq_n_s32(vPointCornerI.X, (center->iX << 16)); vPointCornerI.Y = vqdmlahq(vqdmulhq_n_s32(vTmp1.X, sinAngleFx), vTmp1.Y, cosAngleFx); vPointCornerI.Y = vqaddq_n_s32(vPointCornerI.Y, (center->iY << 16)); /* Check whether rotated index offsets could exceed 16-bit limits used in subsequent gather loads This will occur for parts of large images (e.g. 320*200) To avoid unconditional penalties for small/medium images, returns a speculative overflow allowing to handle large offsets. 
*/ int32_t maxY = vmaxvq(0.0f, vPointCornerI.Y); if(MULTFX(TO_Q16(iOrigStride), maxY) > UINT16_MAX) gatherLoadIdxOverflow = true; /* regression parameters */ vTmp1.X[0] = vPointCornerI.X[0]; vTmp1.X[1] = vPointCornerI.X[1]; vTmp1.X[2] = vPointCornerI.Y[0]; vTmp1.X[3] = vPointCornerI.Y[1]; vTmp1.Y[0] = vPointCornerI.X[2]; vTmp1.Y[1] = vPointCornerI.X[3]; vTmp1.Y[2] = vPointCornerI.Y[2]; vTmp1.Y[3] = vPointCornerI.Y[3]; /* slopes */ vTmp1.X = vqdmulhq_n_s32(vTmp1.Y - vTmp1.X, invHeightFx); regrCoefs[0].slopeY = vTmp1.X[2]; regrCoefs[0].slopeX = vTmp1.X[0]; regrCoefs[0].interceptY = vPointCornerI.Y[0]; regrCoefs[0].interceptX = vPointCornerI.X[0]; regrCoefs[1].slopeY = vTmp1.X[3]; regrCoefs[1].slopeX = vTmp1.X[1]; regrCoefs[1].interceptY = vPointCornerI.Y[1]; regrCoefs[1].interceptX = vPointCornerI.X[1]; return gatherLoadIdxOverflow; } #else static bool __arm_2d_rotate_regression(arm_2d_size_t * __RESTRICT ptCopySize, arm_2d_location_t * pSrcPoint, float fAngle, arm_2d_location_t * tOffset, arm_2d_location_t * center, int32_t iOrigStride, arm_2d_rot_linear_regr_t regrCoefs[] ) { int_fast16_t iHeight = ptCopySize->iHeight; int_fast16_t iWidth = ptCopySize->iWidth; q31_t invHeightFx = 0x7fffffff / (iHeight - 1); int32_t AngleFx = lroundf(fAngle * ONE_BY_2PI_Q31); q31_t cosAngleFx = arm_cos_q31(AngleFx); q31_t sinAngleFx = arm_sin_q31(AngleFx); arm_2d_point_fx_t tPointCornerFx[2][2]; arm_2d_point_fx_t centerQ16; arm_2d_point_fx_t srcPointQ16; arm_2d_point_fx_t tOffsetQ16; arm_2d_point_fx_t tmp; int32_t iXQ16, iYQ16; bool gatherLoadIdxOverflow = 0; /* Q16 conversion */ centerQ16.X = TO_Q16(center->iX); centerQ16.Y = TO_Q16(center->iY); srcPointQ16.X = TO_Q16(pSrcPoint->iX); srcPointQ16.Y = TO_Q16(pSrcPoint->iY); tOffsetQ16.X = TO_Q16(tOffset->iX); tOffsetQ16.Y = TO_Q16(tOffset->iY); /* (0,0) corner */ tmp.X = srcPointQ16.X + 0 + tOffsetQ16.X; tmp.Y = srcPointQ16.Y + 0 + tOffsetQ16.Y; iXQ16 = tmp.X - centerQ16.X; iYQ16 = tmp.Y - centerQ16.Y; tPointCornerFx[0][0].Y = qdadd(qdadd(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx)); tPointCornerFx[0][0].X = qdsub(qdadd(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx)); /* ((iWidth - 1),0) corner */ tmp.X = srcPointQ16.X + 0 + tOffsetQ16.X + TO_Q16(iWidth - 1); iXQ16 = tmp.X - centerQ16.X; tPointCornerFx[1][0].Y = qdadd(qdadd(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx)); tPointCornerFx[1][0].X = qdsub(qdadd(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx)); /* ((iWidth - 1),(iHeight - 1)) corner */ tmp.Y = srcPointQ16.Y + tOffsetQ16.Y + TO_Q16(iHeight - 1); iYQ16 = tmp.Y - centerQ16.Y; tPointCornerFx[1][1].Y = qdadd(qdadd(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx)); tPointCornerFx[1][1].X = qdsub(qdadd(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx)); /* (0,(iHeight - 1)) corner */ tmp.X = srcPointQ16.X + 0 + tOffsetQ16.X; iXQ16 = tmp.X - centerQ16.X; tPointCornerFx[0][1].Y = qdadd(qdadd(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx)); tPointCornerFx[0][1].X = qdsub(qdadd(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx)); /* Check whether rotated index offsets could exceed 16-bit limits used in subsequent gather loads This will occur for parts of large images (e.g. 320*200) To avoid unconditional penalties for small/medium images, returns a speculative overflow allowing to handle large offsets. 
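   For example, a 320x200 RGB565 origin with iOrigStride = 320 puts the
   bottom rows at linear halfword offsets around 320 * 200 = 64000, already
   close to the 65535 ceiling of a 16-bit gather offset; any additional
   origin offset pushes past it, which the MULTFX(TO_Q16(iOrigStride), maxY)
   test below estimates conservatively.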
*/ int32_t maxY = MAX(MAX (MAX(tPointCornerFx[0][0].Y, tPointCornerFx[0][1].Y), tPointCornerFx[1][0].Y), tPointCornerFx[1][1].Y); if(MULTFX(TO_Q16(iOrigStride), maxY) > UINT16_MAX) gatherLoadIdxOverflow = true; /* regression */ int32_t slopeXFx, slopeYFx; /* interpolation in Y direction for 1st elements column */ slopeXFx = MULTFX((tPointCornerFx[0][1].X - tPointCornerFx[0][0].X), invHeightFx); slopeYFx = MULTFX((tPointCornerFx[0][1].Y - tPointCornerFx[0][0].Y), invHeightFx); regrCoefs[0].slopeY = slopeYFx * 2; regrCoefs[0].slopeX = slopeXFx * 2; regrCoefs[0].interceptY = tPointCornerFx[0][0].Y; regrCoefs[0].interceptX = tPointCornerFx[0][0].X; /* interpolation in Y direction for the last elements column */ slopeXFx = MULTFX((tPointCornerFx[1][1].X - tPointCornerFx[1][0].X), invHeightFx); slopeYFx = MULTFX((tPointCornerFx[1][1].Y - tPointCornerFx[1][0].Y), invHeightFx); regrCoefs[1].slopeY = slopeYFx* 2; regrCoefs[1].slopeX = slopeXFx* 2; regrCoefs[1].interceptY = tPointCornerFx[1][0].Y; regrCoefs[1].interceptX = tPointCornerFx[1][0].X; return gatherLoadIdxOverflow; } #endif __OVERRIDE_WEAK void __arm_2d_impl_rgb565_rotate( __arm_2d_param_copy_orig_t *ptParam, __arm_2d_rotate_info_t *ptInfo) { int32_t iHeight = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iHeight; int32_t iWidth = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iWidth; int32_t iTargetStride = ptParam->use_as____arm_2d_param_copy_t.tTarget.iStride; uint16_t *pTargetBase = ptParam->use_as____arm_2d_param_copy_t.tTarget.pBuffer; uint16_t *pOrigin = ptParam->tOrigin.pBuffer; int32_t iOrigStride = ptParam->tOrigin.iStride; uint16_t MaskColour = ptInfo->Mask.hwColour; float fAngle = -ptInfo->fAngle; arm_2d_location_t tOffset = ptParam->use_as____arm_2d_param_copy_t.tSource.tValidRegion.tLocation; arm_2d_location_t *pCenter = &(ptInfo->tCenter); q31_t invIWidth = 0x7fffffff / (iWidth - 1); arm_2d_rot_linear_regr_t regrCoefs[2]; arm_2d_location_t SrcPt = ptInfo->tDummySourceOffset; bool gatherLoadIdxOverflow; /* get regression parameters over 1st and last column */ gatherLoadIdxOverflow = __arm_2d_rotate_regression(&ptParam->use_as____arm_2d_param_copy_t.tCopySize, &SrcPt, fAngle, &tOffset, pCenter, iOrigStride, regrCoefs); /* slopes between 1st and last columns */ int32_t slopeY, slopeX; slopeY = MULTFX((regrCoefs[1].interceptY - regrCoefs[0].interceptY), invIWidth); slopeX = MULTFX((regrCoefs[1].interceptX - regrCoefs[0].interceptX), invIWidth); int32_t nrmSlopeX = 17 - __CLZ(ABS(slopeX)); int32_t nrmSlopeY = 17 - __CLZ(ABS(slopeY)); slopeX = ARSHIFT(slopeX, nrmSlopeX); slopeY = ARSHIFT(slopeY, nrmSlopeY); if (!gatherLoadIdxOverflow) { for (int32_t y = 0; y < iHeight; y++) { /* 1st column estimates */ int32_t colFirstY = qadd((regrCoefs[0].slopeY * y), regrCoefs[0].interceptY); int32_t colFirstX = qadd((regrCoefs[0].slopeX * y), regrCoefs[0].interceptX); /* Q6 conversion */ colFirstX = colFirstX >> 10; colFirstY = colFirstY >> 10; int32_t nbVecElts = iWidth; int16x8_t vX = (int16x8_t) vidupq_n_u16(0, 1); uint16_t *pTargetBaseCur = pTargetBase; /* Q9.6 coversion */ vX = vX * (1<<6); while (nbVecElts > 0) { arm_2d_point_s16x8_t tPointV;; int16x8_t vtmp; vtmp = vqdmulhq_n_s16(vX, slopeX); vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeX), colFirstX); tPointV.X = vtmp >> 6; vtmp = vqdmulhq_n_s16(vX, slopeY); vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeY), colFirstY); tPointV.Y = vtmp >> 6; __arm_2d_impl_rgb565_get_pixel_colour(&tPointV, &ptParam->tOrigin.tValidRegion, pOrigin, iOrigStride, pTargetBaseCur, MaskColour, 
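/* Fixed-point pipeline, per lane (scalar sketch, ignoring rounding and
 * saturation; names hypothetical): the Q16 slope is pre-scaled by
 * nrm = 17 - __CLZ(ABS(slope)) so it fits a signed 16-bit multiplicand,
 * then the scale is rebuilt inside the loop:
 *
 *     int16_t slope16  = (int16_t) ARSHIFT(slope_q16, nrm); // fits in s16
 *     int32_t step     = ((int32_t) x_q6 * slope16) >> 15;  // vqdmulhq_n_s16
 *     int32_t coord_q6 = (step << nrm) + colFirst_q6;       // vqrshlq + vaddq
 *     int16_t coord    = (int16_t)(coord_q6 >> 6);          // back to integer
 *
 * x_q6 is the Q9.6 column index (vX seeded with << 6) and colFirst_q6 the
 * intercept converted from Q16 to Q6 by the >> 10 above. */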
nbVecElts); pTargetBaseCur += 8; vX += ((1<<6) * 8); nbVecElts -= 8; } pTargetBase += iTargetStride; } } else { for (int32_t y = 0; y < iHeight; y++) { /* 1st column estimates */ int32_t colFirstY = qadd((regrCoefs[0].slopeY * y), regrCoefs[0].interceptY); int32_t colFirstX = qadd((regrCoefs[0].slopeX * y), regrCoefs[0].interceptX); /* Q6 conversion */ colFirstX = colFirstX >> 10; colFirstY = colFirstY >> 10; int32_t nbVecElts = iWidth; int16x8_t vX = (int16x8_t) vidupq_n_u16(0, 1); uint16_t *pTargetBaseCur = pTargetBase; /* Q9.6 coversion */ vX = vX * (1 << 6); while (nbVecElts > 0) { arm_2d_point_s16x8_t tPointV;; int16x8_t vtmp; vtmp = vqdmulhq_n_s16(vX, slopeX); vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeX), colFirstX); tPointV.X = vtmp >> 6; vtmp = vqdmulhq_n_s16(vX, slopeY); vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeY), colFirstY); tPointV.Y = vtmp >> 6; /* get Y minimum, subtract 1 to compensate negative X, as gather load index cannot be negative */ int16_t correctionOffset = vminvq_s16(0x7fff, tPointV.Y) - 1; __arm_2d_impl_rgb565_get_pixel_colour_offs_compensated(&tPointV, &ptParam->tOrigin. tValidRegion, pOrigin, iOrigStride, pTargetBaseCur, MaskColour, nbVecElts, correctionOffset); pTargetBaseCur += 8; vX += ((1 << 6) * 8); nbVecElts -= 8; } pTargetBase += iTargetStride; } } } __OVERRIDE_WEAK void __arm_2d_impl_rgb565_rotate_alpha( __arm_2d_param_copy_orig_t *ptParam, __arm_2d_rotate_info_t *ptInfo, uint_fast8_t chRatio) { int32_t iHeight = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iHeight; int32_t iWidth = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iWidth; int32_t iTargetStride = ptParam->use_as____arm_2d_param_copy_t.tTarget.iStride; uint16_t *pTargetBase = ptParam->use_as____arm_2d_param_copy_t.tTarget.pBuffer; uint16_t *pOrigin = ptParam->tOrigin.pBuffer; int32_t iOrigStride = ptParam->tOrigin.iStride; uint16_t MaskColour = ptInfo->Mask.hwColour; float fAngle = -ptInfo->fAngle; arm_2d_location_t tOffset = ptParam->use_as____arm_2d_param_copy_t.tSource.tValidRegion.tLocation; arm_2d_location_t *pCenter = &(ptInfo->tCenter); uint16_t hwRatioCompl = 256 - chRatio; q31_t invIWidth = 0x7fffffff / (iWidth - 1); arm_2d_rot_linear_regr_t regrCoefs[2]; arm_2d_location_t SrcPt = ptInfo->tDummySourceOffset; bool gatherLoadIdxOverflow; /* get regression parameters over 1st and last column */ gatherLoadIdxOverflow = __arm_2d_rotate_regression(&ptParam->use_as____arm_2d_param_copy_t.tCopySize, &SrcPt, fAngle, &tOffset, pCenter, iOrigStride, regrCoefs); /* slopes between 1st and last columns */ int32_t slopeY, slopeX; slopeY = MULTFX((regrCoefs[1].interceptY - regrCoefs[0].interceptY), invIWidth); slopeX = MULTFX((regrCoefs[1].interceptX - regrCoefs[0].interceptX), invIWidth); int32_t nrmSlopeX = 17 - __CLZ(ABS(slopeX)); int32_t nrmSlopeY = 17 - __CLZ(ABS(slopeY)); slopeX = ARSHIFT(slopeX, nrmSlopeX); slopeY = ARSHIFT(slopeY, nrmSlopeY); if (!gatherLoadIdxOverflow) { for (int32_t y = 0; y < iHeight; y++) { /* 1st column estimates */ int32_t colFirstY = qadd((regrCoefs[0].slopeY * y), regrCoefs[0].interceptY); int32_t colFirstX = qadd((regrCoefs[0].slopeX * y), regrCoefs[0].interceptX); /* Q6 conversion */ colFirstX = colFirstX >> 10; colFirstY = colFirstY >> 10; int32_t nbVecElts = iWidth; int16x8_t vX = (int16x8_t) vidupq_n_u16(0, 1); uint16_t *pTargetBaseCur = pTargetBase; /* Q9.6 coversion */ vX = vX * (1 << 6); while (nbVecElts > 0) { /* interpolation */ arm_2d_point_s16x8_t tPointV;; int16x8_t vtmp; vtmp = vqdmulhq_n_s16(vX, slopeX); vtmp = 
vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeX), colFirstX); tPointV.X = vtmp >> 6; vtmp = vqdmulhq_n_s16(vX, slopeY); vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeY), colFirstY); tPointV.Y = vtmp >> 6; __arm_2d_impl_rgb565_get_pixel_colour_with_alpha(&tPointV, &ptParam-> tOrigin.tValidRegion, pOrigin, iOrigStride, pTargetBaseCur, MaskColour, hwRatioCompl, nbVecElts); pTargetBaseCur += 8; vX += ((1 << 6) * 8); nbVecElts -= 8; } pTargetBase += iTargetStride; } } else { /* Large image / Large origin offsets Gather load 16-bit could overflow - Y offset needs to be shifted down to avoid overflow - 16-bit gather loads base address is incremented Needs to be done in the inner loop. In the case of steep slopes, taking the minimum between the Y extrema could still generate overflows */ for (int32_t y = 0; y < iHeight; y++) { /* 1st column estimates */ int32_t colFirstY = qadd((regrCoefs[0].slopeY * y), regrCoefs[0].interceptY); int32_t colFirstX = qadd((regrCoefs[0].slopeX * y), regrCoefs[0].interceptX); /* Q6 conversion */ colFirstX = colFirstX >> 10; colFirstY = colFirstY >> 10; int32_t nbVecElts = iWidth; int16x8_t vX = (int16x8_t) vidupq_n_u16(0, 1); uint16_t *pTargetBaseCur = pTargetBase; /* Q9.6 coversion */ vX = vX * (1 << 6); while (nbVecElts > 0) { /* interpolation */ arm_2d_point_s16x8_t tPointV;; int16x8_t vtmp; vtmp = vqdmulhq_n_s16(vX, slopeX); vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeX), colFirstX); tPointV.X = vtmp >> 6; vtmp = vqdmulhq_n_s16(vX, slopeY); vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeY), colFirstY); tPointV.Y = vtmp >> 6; /* get Y minimum, subtract 1 to compensate negative X, as gather load index cannot be negative */ int16_t correctionOffset = vminvq_s16(0x7fff, tPointV.Y) - 1; __arm_2d_impl_rgb565_get_pixel_colour_with_alpha_offs_compensated (&tPointV, &ptParam->tOrigin.tValidRegion, pOrigin, iOrigStride, pTargetBaseCur, MaskColour, hwRatioCompl, nbVecElts, correctionOffset); pTargetBaseCur += 8; vX += ((1 << 6) * 8); nbVecElts -= 8; } pTargetBase += iTargetStride; } } } /* untested */ __OVERRIDE_WEAK void __arm_2d_impl_rgb888_rotate( __arm_2d_param_copy_orig_t *ptParam, __arm_2d_rotate_info_t *ptInfo) { int32_t iHeight = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iHeight; int32_t iWidth = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iWidth; int32_t iTargetStride = ptParam->use_as____arm_2d_param_copy_t.tTarget.iStride; uint32_t *pTargetBase = ptParam->use_as____arm_2d_param_copy_t.tTarget.pBuffer; uint32_t *pOrigin = ptParam->tOrigin.pBuffer; int32_t iOrigStride = ptParam->tOrigin.iStride; uint32_t MaskColour = ptInfo->Mask.hwColour; float fAngle = -ptInfo->fAngle; arm_2d_location_t tOffset = ptParam->use_as____arm_2d_param_copy_t.tSource.tValidRegion.tLocation; arm_2d_location_t *pCenter = &(ptInfo->tCenter); q31_t invIWidth = 0x7fffffff / (iWidth - 1); arm_2d_rot_linear_regr_t regrCoefs[2]; arm_2d_location_t SrcPt = ptInfo->tDummySourceOffset; /* get regression parameters over 1st and last column */ __arm_2d_rotate_regression(&ptParam->use_as____arm_2d_param_copy_t.tCopySize, &SrcPt, fAngle, &tOffset, pCenter, iOrigStride, regrCoefs); /* slopes between 1st and last columns */ int32_t slopeY, slopeX; slopeY = MULTFX((regrCoefs[1].interceptY - regrCoefs[0].interceptY), invIWidth); slopeX = MULTFX((regrCoefs[1].interceptX - regrCoefs[0].interceptX), invIWidth); int32_t nrmSlopeX = 17 - __CLZ(ABS(slopeX)); int32_t nrmSlopeY = 17 - __CLZ(ABS(slopeY)); slopeX = ARSHIFT(slopeX, nrmSlopeX); slopeY = ARSHIFT(slopeY, nrmSlopeY); for (int32_t y = 0; y < 
iHeight; y++) {
        /* 1st column estimates */
        int32_t colFirstY = qadd((regrCoefs[0].slopeY * y), regrCoefs[0].interceptY);
        int32_t colFirstX = qadd((regrCoefs[0].slopeX * y), regrCoefs[0].interceptX);

        /* Q6 conversion */
        colFirstX = colFirstX >> 10;
        colFirstY = colFirstY >> 10;

        int32_t nbVecElts = iWidth;
        int16x8_t vX = (int16x8_t) vidupq_n_u16(0, 1);
        uint32_t *pTargetBaseCur = pTargetBase;

        /* Q9.6 conversion */
        vX = vX * (1 << 6);

        while (nbVecElts > 0) {
            arm_2d_point_s16x8_t tPointV;
            int16x8_t vtmp;

            vtmp = vqdmulhq_n_s16(vX, slopeX);
            vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeX), colFirstX);
            tPointV.X = vtmp >> 6;

            vtmp = vqdmulhq_n_s16(vX, slopeY);
            vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeY), colFirstY);
            tPointV.Y = vtmp >> 6;

            /* write through the per-row cursor, not the row base pointer */
            __arm_2d_impl_rgb888_get_pixel_colour_mve(&tPointV,
                    &ptParam->tOrigin.tValidRegion, pOrigin, iOrigStride,
                    pTargetBaseCur, MaskColour, nbVecElts);

            pTargetBaseCur += 8;
            vX += ((1 << 6) * 8);
            nbVecElts -= 8;
        }
        pTargetBase += iTargetStride;
    }
}

/* untested */
__OVERRIDE_WEAK
void __arm_2d_impl_rgb888_rotate_alpha(__arm_2d_param_copy_orig_t *ptParam,
                                       __arm_2d_rotate_info_t *ptInfo,
                                       uint_fast8_t chRatio)
{
    int32_t iHeight = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iHeight;
    int32_t iWidth = ptParam->use_as____arm_2d_param_copy_t.tCopySize.iWidth;
    int32_t iTargetStride = ptParam->use_as____arm_2d_param_copy_t.tTarget.iStride;
    uint32_t *pTargetBase = ptParam->use_as____arm_2d_param_copy_t.tTarget.pBuffer;
    uint32_t *pOrigin = ptParam->tOrigin.pBuffer;
    int32_t iOrigStride = ptParam->tOrigin.iStride;
    uint32_t MaskColour = ptInfo->Mask.hwColour;
    float fAngle = -ptInfo->fAngle;
    arm_2d_location_t tOffset = ptParam->use_as____arm_2d_param_copy_t.tSource.tValidRegion.tLocation;
    uint16_t wRatioCompl = 256 - chRatio;
    arm_2d_location_t *pCenter = &(ptInfo->tCenter);
    q31_t invIWidth = 0x7fffffff / (iWidth - 1);
    arm_2d_rot_linear_regr_t regrCoefs[2];
    arm_2d_location_t SrcPt = ptInfo->tDummySourceOffset;

    /* get regression parameters over 1st and last column */
    __arm_2d_rotate_regression(&ptParam->use_as____arm_2d_param_copy_t.tCopySize,
                               &SrcPt, fAngle, &tOffset, pCenter, iOrigStride, regrCoefs);

    /* slopes between 1st and last columns */
    int32_t slopeY, slopeX;
    slopeY = MULTFX((regrCoefs[1].interceptY - regrCoefs[0].interceptY), invIWidth);
    slopeX = MULTFX((regrCoefs[1].interceptX - regrCoefs[0].interceptX), invIWidth);

    int32_t nrmSlopeX = 17 - __CLZ(ABS(slopeX));
    int32_t nrmSlopeY = 17 - __CLZ(ABS(slopeY));
    slopeX = ARSHIFT(slopeX, nrmSlopeX);
    slopeY = ARSHIFT(slopeY, nrmSlopeY);

    for (int32_t y = 0; y < iHeight; y++) {
        /* 1st column estimates */
        int32_t colFirstY = qadd((regrCoefs[0].slopeY * y), regrCoefs[0].interceptY);
        int32_t colFirstX = qadd((regrCoefs[0].slopeX * y), regrCoefs[0].interceptX);

        /* Q6 conversion */
        colFirstX = colFirstX >> 10;
        colFirstY = colFirstY >> 10;

        int32_t nbVecElts = iWidth;
        int16x8_t vX = (int16x8_t) vidupq_n_u16(0, 1);
        uint32_t *pTargetBaseCur = pTargetBase;

        /* Q9.6 conversion */
        vX = vX * (1 << 6);

        while (nbVecElts > 0) {
            arm_2d_point_s16x8_t tPointV;
            int16x8_t vtmp;

            vtmp = vqdmulhq_n_s16(vX, slopeX);
            vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeX), colFirstX);
            tPointV.X = vtmp >> 6;

            vtmp = vqdmulhq_n_s16(vX, slopeY);
            vtmp = vaddq_n_s16(vqrshlq_n_s16(vtmp, nrmSlopeY), colFirstY);
            tPointV.Y = vtmp >> 6;

            /* write through the per-row cursor, not the row base pointer */
            __arm_2d_impl_rgb888_get_pixel_colour_with_alpha_mve(&tPointV,
                    &ptParam->tOrigin.tValidRegion, pOrigin, iOrigStride,
                    pTargetBaseCur, MaskColour, wRatioCompl, nbVecElts);

            pTargetBaseCur += 8;
            vX += ((1 << 6) * 8);
            nbVecElts -= 8;
        }
        pTargetBase +=
iTargetStride; } } #endif /* rgb16_draw_pattern helpers */ /* * enable to pick gather load offset based on initial offset * e.g. if iOffset = 3 * will get {0, 0, 0, 0, 0, 1, 1, 1} */ static uint16_t __rgb16_draw_pattern_src_incr[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; /* * enable to pick vector bitmask based on initial offset * e.g. if iOffset = 3 * will get {8, 16, 32, 64, 128, 1, 2, 4} */ static uint16_t __rgb16_draw_pattern_src_bitmask[16] = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, }; /* rgb32_draw_pattern helpers */ static uint32_t __rgb32_draw_pattern_src_incr[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; static uint32_t __rgb32_draw_pattern_src_bitmask[16] = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, }; __OVERRIDE_WEAK void __arm_2d_impl_rgb16_draw_pattern_fg_only(uint8_t *__RESTRICT pchSourceBase, int32_t iOffset, int16_t iSourceStride, uint16_t *__RESTRICT pTargetBase, int16_t iTargetStride, arm_2d_size_t *__RESTRICT ptCopySize, uint16_t hwForeColour) { //! get in byte offset iOffset &= 0x07; iSourceStride = (iSourceStride + 7) & ~0x07; /* deduces offset vector from iOffset for gather loading */ uint16x8_t offS = vld1q(__rgb16_draw_pattern_src_incr + iOffset); /* deduces bitmask vector with wrap from iOffset */ uint16x8_t vBitMask = vld1q(__rgb16_draw_pattern_src_bitmask + iOffset); if (ptCopySize->iWidth <= 8) { /* small width specialization */ /* no inner loop */ mve_pred16_t p = vctp16q(ptCopySize->iWidth); for (int32_t y = 0; y < ptCopySize->iHeight; y++) { uint16x8_t vchSrc = vldrbq_gather_offset_u16(pchSourceBase, offS); uint16x8_t vTarg = vld1q(pTargetBase); vchSrc = vchSrc & vBitMask; vTarg = vdupq_m_n_u16(vTarg, hwForeColour, vcmpneq_n_u16(vchSrc, 0)); vst1q_p(pTargetBase, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else { for (int32_t y = 0; y < ptCopySize->iHeight; y++) { int32_t cnt = ptCopySize->iWidth; uint8_t *pchSourceBaseCur = pchSourceBase; uint16_t *pTargetBaseCur = pTargetBase; while (cnt > 0) { mve_pred16_t p = vctp16q(cnt); uint16x8_t vchSrc = vldrbq_gather_offset_z_u16(pchSourceBaseCur, offS, p); uint16x8_t vTarg = vld1q_z(pTargetBaseCur, p); vchSrc = vandq_x(vchSrc, vBitMask, p); vTarg = vdupq_m_n_u16(vTarg, hwForeColour, vcmpneq_m_n_u16(vchSrc, 0, p)); vst1q_p(pTargetBaseCur, vTarg, p); pTargetBaseCur += 8; pchSourceBaseCur += 1; cnt -= 8; } pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } } __OVERRIDE_WEAK void __arm_2d_impl_rgb16_draw_pattern_no_bg_comp(uint8_t * __RESTRICT pchSourceBase, int32_t iOffset, int16_t iSourceStride, uint16_t * __RESTRICT pTargetBase, int16_t iTargetStride, arm_2d_size_t * __RESTRICT ptCopySize) { //! 
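/* The rgb16 draw-pattern family expands a 1 bit-per-pixel bitmap, LSB first.
 * A scalar model of one row of the foreground-only variant above (names
 * hypothetical):
 *
 *     for (int32_t x = 0; x < ptCopySize->iWidth; x++) {
 *         uint8_t bits = pchSourceBase[(iOffset + x) >> 3];
 *         if (bits & (1 << ((iOffset + x) & 7)))
 *             pTargetBase[x] = hwForeColour;
 *     }
 *
 * the __rgb16_draw_pattern_src_incr/_bitmask tables vectorise exactly this
 * byte-select and bit-select, pre-rotated by the initial bit offset. */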
get in byte offset iOffset &= 0x07; iSourceStride = (iSourceStride + 7) & ~0x07; /* deduces offset vector from iOffset for gather loading */ uint16x8_t offS = vld1q(__rgb16_draw_pattern_src_incr + iOffset); /* deduces bitmask vector with wrap from iOffset */ uint16x8_t vBitMask = vld1q(__rgb16_draw_pattern_src_bitmask + iOffset); if (ptCopySize->iWidth <= 8) { /* small width specialization */ /* no inner loop */ mve_pred16_t p = vctp16q(ptCopySize->iWidth); for (int32_t y = 0; y < ptCopySize->iHeight; y++) { uint16x8_t vchSrc = vldrbq_gather_offset_u16(pchSourceBase, offS); uint16x8_t vTarg = vld1q(pTargetBase); vchSrc = vchSrc & vBitMask; vTarg = vpselq(~vTarg, vTarg, vcmpneq_n_u16(vchSrc, 0));//vTarg = vpselq(vTarg, ~vTarg, vcmpneq_n_u16(vchSrc, 0)); vst1q_p(pTargetBase, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else { for (int32_t y = 0; y < ptCopySize->iHeight; y++) { int32_t cnt = ptCopySize->iWidth; uint8_t *pchSourceBaseCur = pchSourceBase; uint16_t *pTargetBaseCur = pTargetBase; while (cnt > 0) { mve_pred16_t p = vctp16q(cnt); uint16x8_t vchSrc = vldrbq_gather_offset_z_u16(pchSourceBaseCur, offS, p); uint16x8_t vTarg = vld1q_z(pTargetBaseCur, p); vchSrc = vandq_x(vchSrc, vBitMask, p); vTarg = vpselq(vmvnq_x(vTarg, p), vTarg, vcmpneq_m_n_u16(vchSrc, 0, p));//vTarg = vpselq(vTarg, vmvnq_x(vTarg, p), vcmpneq_m_n_u16(vchSrc, 0, p)); vst1q_p(pTargetBaseCur, vTarg, p); pTargetBaseCur += 8; pchSourceBaseCur += 1; cnt -= 8; } pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } } __OVERRIDE_WEAK void __arm_2d_impl_rgb16_draw_pattern_bg_only(uint8_t *__RESTRICT pchSourceBase, int32_t iOffset, int16_t iSourceStride, uint16_t *__RESTRICT pTargetBase, int16_t iTargetStride, arm_2d_size_t *__RESTRICT ptCopySize, uint16_t hwBackColour) { //! 
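/* no-background complement variant: where the pattern bit is set the target
 * pixel is inverted, i.e. the scalar equivalent of the vpselq above is
 * if (bit) *pTarget = ~(*pTarget); pixels under clear bits are untouched. */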
get in byte offset iOffset &= 0x07; iSourceStride = (iSourceStride + 7) & ~0x07; /* deduces offset vector from iOffset for gather loading */ uint16x8_t offS = vld1q(__rgb16_draw_pattern_src_incr + iOffset); /* deduces bitmask vector with wrap from iOffset */ uint16x8_t vBitMask = vld1q(__rgb16_draw_pattern_src_bitmask + iOffset); if (ptCopySize->iWidth <= 8) { /* small width specialization */ /* no inner loop */ mve_pred16_t p = vctp16q(ptCopySize->iWidth); for (int32_t y = 0; y < ptCopySize->iHeight; y++) { uint16x8_t vchSrc = vldrbq_gather_offset_u16(pchSourceBase, offS); uint16x8_t vTarg = vld1q(pTargetBase); vchSrc = vchSrc & vBitMask; vTarg = vdupq_m_n_u16(vTarg, hwBackColour, vcmpeqq_n_u16(vchSrc, 0)); vst1q_p(pTargetBase, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else { for (int32_t y = 0; y < ptCopySize->iHeight; y++) { int32_t cnt = ptCopySize->iWidth; uint8_t *pchSourceBaseCur = pchSourceBase; uint16_t *pTargetBaseCur = pTargetBase; while (cnt > 0) { mve_pred16_t p = vctp16q(cnt); uint16x8_t vchSrc = vldrbq_gather_offset_z_u16(pchSourceBaseCur, offS, p); uint16x8_t vTarg = vld1q_z(pTargetBaseCur, p); vchSrc = vandq_x(vchSrc, vBitMask, p); vTarg = vdupq_m_n_u16(vTarg, hwBackColour, vcmpeqq_m_n_u16(vchSrc, 0, p)); vst1q_p(pTargetBaseCur, vTarg, p); pTargetBaseCur += 8; pchSourceBaseCur += 1; cnt -= 8; } pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } } __OVERRIDE_WEAK void __arm_2d_impl_rgb16_draw_pattern_bg_fg(uint8_t *__RESTRICT pchSourceBase, int32_t iOffset, int16_t iSourceStride, uint16_t *__RESTRICT pTargetBase, int16_t iTargetStride, arm_2d_size_t *__RESTRICT ptCopySize, uint16_t hwForeColour, uint16_t hwBackColour) { //! get in byte offset iOffset &= 0x07; iSourceStride = (iSourceStride + 7) & ~0x07; /* deduces offset vector from iOffset for gather loading */ uint16x8_t offS = vld1q(__rgb16_draw_pattern_src_incr + iOffset); /* deduces bitmask vector with wrap from iOffset */ uint16x8_t vBitMask = vld1q(__rgb16_draw_pattern_src_bitmask + iOffset); uint16x8_t vFgColor = vdupq_n_u16(hwForeColour); uint16x8_t vBgColor = vdupq_n_u16(hwBackColour); if (ptCopySize->iWidth <= 8) { /* small width specialization */ /* no inner loop */ mve_pred16_t p = vctp16q(ptCopySize->iWidth); for (int32_t y = 0; y < ptCopySize->iHeight; y++) { uint16x8_t vchSrc = vldrbq_gather_offset_u16(pchSourceBase, offS); uint16x8_t vTarg; vchSrc = vchSrc & vBitMask; vTarg = vpselq(vFgColor, vBgColor, vcmpneq_n_u16(vchSrc, 0)); vst1q_p(pTargetBase, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else { for (int32_t y = 0; y < ptCopySize->iHeight; y++) { int32_t cnt = ptCopySize->iWidth; uint8_t *pchSourceBaseCur = pchSourceBase; uint16_t *pTargetBaseCur = pTargetBase; while (cnt > 0) { mve_pred16_t p = vctp16q(cnt); uint16x8_t vchSrc = vldrbq_gather_offset_z_u16(pchSourceBaseCur, offS, p); uint16x8_t vTarg; vchSrc = vandq_x(vchSrc, vBitMask, p); vTarg = vpselq(vFgColor, vBgColor, vcmpneq_m_n_u16(vchSrc, 0, p)); vst1q_p(pTargetBaseCur, vTarg, p); pTargetBaseCur += 8; pchSourceBaseCur += 1; cnt -= 8; } pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } } __OVERRIDE_WEAK void __arm_2d_impl_rgb16_draw_pattern_bg_comp(uint8_t *__RESTRICT pchSourceBase, int32_t iOffset, int16_t iSourceStride, uint16_t *__RESTRICT pTargetBase, int16_t iTargetStride, arm_2d_size_t *__RESTRICT ptCopySize, uint16_t hwBackColour) { //! 
get in byte offset
    iOffset &= 0x07;

    iSourceStride = (iSourceStride + 7) & ~0x07;

    /* deduces offset vector from iOffset for gather loading */
    uint16x8_t offS = vld1q(__rgb16_draw_pattern_src_incr + iOffset);
    /* deduces bitmask vector with wrap from iOffset */
    uint16x8_t vBitMask = vld1q(__rgb16_draw_pattern_src_bitmask + iOffset);
    uint16x8_t vBgColor = vdupq_n_u16(hwBackColour);

    if (ptCopySize->iWidth <= 8) {
        /* small width specialization */
        /* no inner loop */
        mve_pred16_t p = vctp16q(ptCopySize->iWidth);

        for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
            uint16x8_t vchSrc = vldrbq_gather_offset_u16(pchSourceBase, offS);
            uint16x8_t vTarg = vld1q(pTargetBase);

            /* if ((*pchSrc) & chBitMask) *pTarget = ~(*pTarget); else *pTarget = hwBackColour; */
            vchSrc = vchSrc & vBitMask;
            vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq_n_u16(vchSrc, 0));

            vst1q_p(pTargetBase, vTarg, p);

            pchSourceBase += (iSourceStride >> 3);
            pTargetBase += iTargetStride;
        }
    } else {
        for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
            int32_t cnt = ptCopySize->iWidth;
            uint8_t *pchSourceBaseCur = pchSourceBase;
            uint16_t *pTargetBaseCur = pTargetBase;

            while (cnt > 0) {
                mve_pred16_t p = vctp16q(cnt);
                uint16x8_t vchSrc = vldrbq_gather_offset_z_u16(pchSourceBaseCur, offS, p);
                /* load through the per-row cursor */
                uint16x8_t vTarg = vld1q_z(pTargetBaseCur, p);

                vchSrc = vandq_x(vchSrc, vBitMask, p);
                vTarg = vpselq(vmvnq_x(vTarg, p), vBgColor, vcmpneq_m_n_u16(vchSrc, 0, p));

                vst1q_p(pTargetBaseCur, vTarg, p);

                pTargetBaseCur += 8;
                pchSourceBaseCur += 1;
                cnt -= 8;
            }
            pchSourceBase += (iSourceStride >> 3);
            pTargetBase += iTargetStride;
        }
    }
}

__OVERRIDE_WEAK
void __arm_2d_impl_rgb32_draw_pattern_fg_only(uint8_t *__RESTRICT pchSourceBase,
                                              int32_t iOffset,
                                              int16_t iSourceStride,
                                              uint32_t *__RESTRICT pTargetBase,
                                              int16_t iTargetStride,
                                              arm_2d_size_t *__RESTRICT ptCopySize,
                                              uint32_t hwForeColour)
{
    //!
get in byte offset iOffset &= 0x07; iSourceStride = (iSourceStride + 7) & ~0x07; /* deduces offset vector from iOffset for gather loading */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t offSLo = vld1q(__rgb32_draw_pattern_src_incr + iOffset); uint32x4_t offSHi = vld1q(__rgb32_draw_pattern_src_incr + iOffset + 4); /* deduces bitmask vector with wrap from iOffset */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t vBitMaskLo = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset); uint32x4_t vBitMaskHi = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset + 4); if (ptCopySize->iWidth <= 4) { /* very tall width case */ /* only bottom parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg = vld1q(pTargetBase); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vdupq_m_n_u32(vTarg, hwForeColour, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else if (ptCopySize->iWidth <= 8) { /* bottom and partial upper parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth - 4); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg = vld1q(pTargetBase); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vdupq_m_n_u32(vTarg, hwForeColour, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBase, vTarg); vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSHi); vTarg = vld1q(pTargetBase + 4); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vdupq_m_n_u32(vTarg, hwForeColour, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase + 4, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else { /* generic case */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { int32_t cnt = ptCopySize->iWidth; uint8_t *pchSourceBaseCur = pchSourceBase; uint32_t *pTargetBaseCur = pTargetBase; while (cnt >= 8) { uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vdupq_m_n_u32(vTarg, hwForeColour, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vdupq_m_n_u32(vTarg, hwForeColour, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; pchSourceBaseCur += 1; cnt -= 8; } /* tail */ if (cnt > 4) { /* bottom part + upper residual parts */ uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); cnt -= 4; mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vdupq_m_n_u32(vTarg, hwForeColour, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vdupq_m_n_u32(vTarg, hwForeColour, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } else if (cnt > 0) { /* bottom part residual */ uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = 
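/* tail policy for the 32-bit variants: with 4 < cnt < 8 left, the low quad
 * is stored unpredicated and only the high quad is predicated with
 * vctp32q(cnt - 4); with cnt <= 4 (this branch) a single predicated quad
 * covers the residual. */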
vdupq_m_n_u32(vTarg, hwForeColour, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } } __OVERRIDE_WEAK void __arm_2d_impl_rgb32_draw_pattern_no_bg_comp(uint8_t *__RESTRICT pchSourceBase, int32_t iOffset, int16_t iSourceStride, uint32_t *__RESTRICT pTargetBase, int16_t iTargetStride, arm_2d_size_t *__RESTRICT ptCopySize) { //! get in byte offset iOffset &= 0x07; iSourceStride = (iSourceStride + 7) & ~0x07; /* deduces offset vector from iOffset for gather loading */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t offSLo = vld1q(__rgb32_draw_pattern_src_incr + iOffset); uint32x4_t offSHi = vld1q(__rgb32_draw_pattern_src_incr + iOffset + 4); /* deduces bitmask vector with wrap from iOffset */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t vBitMaskLo = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset); uint32x4_t vBitMaskHi = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset + 4); if (ptCopySize->iWidth <= 4) { /* very tall width case */ /* only bottom parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg = vld1q(pTargetBase); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vTarg, ~vTarg, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else if (ptCopySize->iWidth <= 8) { /* bottom and partial upper parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth - 4); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg = vld1q(pTargetBase); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vTarg, ~vTarg, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBase, vTarg); vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSHi); vTarg = vld1q(pTargetBase + 4); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vpselq(vTarg, ~vTarg, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase + 4, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else { /* generic case */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { int32_t cnt = ptCopySize->iWidth; uint8_t *pchSourceBaseCur = pchSourceBase; uint32_t *pTargetBaseCur = pTargetBase; while (cnt >= 8) { uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vTarg, ~vTarg, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vpselq(vTarg, ~vTarg, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; pchSourceBaseCur += 1; cnt -= 8; } /* tail */ if (cnt > 4) { /* bottom part + upper residual parts */ uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); cnt -= 4; mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vTarg, ~vTarg, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = 
vpselq(vTarg, ~vTarg, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } else if (cnt > 0) { /* bottom part residual */ uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vTarg, ~vTarg, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } } __OVERRIDE_WEAK void __arm_2d_impl_rgb32_draw_pattern_bg_only(uint8_t *__RESTRICT pchSourceBase, int32_t iOffset, int16_t iSourceStride, uint32_t *__RESTRICT pTargetBase, int16_t iTargetStride, arm_2d_size_t *__RESTRICT ptCopySize, uint32_t hwBackColour) { //! get in byte offset iOffset &= 0x07; iSourceStride = (iSourceStride + 7) & ~0x07; /* deduces offset vector from iOffset for gather loading */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t offSLo = vld1q(__rgb32_draw_pattern_src_incr + iOffset); uint32x4_t offSHi = vld1q(__rgb32_draw_pattern_src_incr + iOffset + 4); /* deduces bitmask vector with wrap from iOffset */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t vBitMaskLo = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset); uint32x4_t vBitMaskHi = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset + 4); if (ptCopySize->iWidth <= 4) { /* very tall width case */ /* only bottom parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg = vld1q(pTargetBase); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vdupq_m_n_u32(vTarg, hwBackColour, vcmpeqq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else if (ptCopySize->iWidth <= 8) { /* bottom and partial upper parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth - 4); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg = vld1q(pTargetBase); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vdupq_m_n_u32(vTarg, hwBackColour, vcmpeqq_n_u32(vchSrc, 0)); vst1q(pTargetBase, vTarg); vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSHi); vTarg = vld1q(pTargetBase + 4); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vdupq_m_n_u32(vTarg, hwBackColour, vcmpeqq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase + 4, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else { /* generic case */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { int32_t cnt = ptCopySize->iWidth; uint8_t *pchSourceBaseCur = pchSourceBase; uint32_t *pTargetBaseCur = pTargetBase; while (cnt >= 8) { uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vdupq_m_n_u32(vTarg, hwBackColour, vcmpeqq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vdupq_m_n_u32(vTarg, hwBackColour, vcmpeqq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; pchSourceBaseCur += 1; cnt -= 8; } /* tail */ if (cnt > 4) { /* bottom part + upper residual parts */ uint32x4_t vchSrc = 
vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); cnt -= 4; mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vdupq_m_n_u32(vTarg, hwBackColour, vcmpeqq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vdupq_m_n_u32(vTarg, hwBackColour, vcmpeqq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } else if (cnt > 0) { /* bottom part residual */ uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vdupq_m_n_u32(vTarg, hwBackColour, vcmpeqq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } } __OVERRIDE_WEAK void __arm_2d_impl_rgb32_draw_pattern_bg_fg(uint8_t *__RESTRICT pchSourceBase, int32_t iOffset, int16_t iSourceStride, uint32_t *__RESTRICT pTargetBase, int16_t iTargetStride, arm_2d_size_t *__RESTRICT ptCopySize, uint32_t hwForeColour, uint32_t hwBackColour) { //! get in byte offset iOffset &= 0x07; iSourceStride = (iSourceStride + 7) & ~0x07; /* deduces offset vector from iOffset for gather loading */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t offSLo = vld1q(__rgb32_draw_pattern_src_incr + iOffset); uint32x4_t offSHi = vld1q(__rgb32_draw_pattern_src_incr + iOffset + 4); /* deduces bitmask vector with wrap from iOffset */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t vBitMaskLo = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset); uint32x4_t vBitMaskHi = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset + 4); uint32x4_t vFgColor = vdupq_n_u32(hwForeColour); uint32x4_t vBgColor = vdupq_n_u32(hwBackColour); if (ptCopySize->iWidth <= 4) { /* very tall width case */ /* only bottom parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg; vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vFgColor, vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else if (ptCopySize->iWidth <= 8) { /* bottom and partial upper parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth - 4); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg; vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vFgColor, vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBase, vTarg); vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSHi); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vpselq(vFgColor, vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase + 4, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else { /* generic case */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { int32_t cnt = ptCopySize->iWidth; uint8_t *pchSourceBaseCur = pchSourceBase; uint32_t *pTargetBaseCur = pTargetBase; while (cnt >= 8) { uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg; vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vFgColor, vBgColor, vcmpneq_n_u32(vchSrc, 0)); 
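/* scalar equivalent of the fg/bg select above:
 *     pTarget[x] = (pattern bit set) ? hwForeColour : hwBackColour; */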
vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vpselq(vFgColor, vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; pchSourceBaseCur += 1; cnt -= 8; } /* tail */ if (cnt > 4) { /* bottom part + upper residual parts */ uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg; cnt -= 4; mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vFgColor, vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vpselq(vFgColor, vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } else if (cnt > 0) { /* bottom part residual */ uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg; mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vFgColor, vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } } __OVERRIDE_WEAK void __arm_2d_impl_rgb32_draw_pattern_bg_comp(uint8_t *__RESTRICT pchSourceBase, int32_t iOffset, int16_t iSourceStride, uint32_t *__RESTRICT pTargetBase, int16_t iTargetStride, arm_2d_size_t *__RESTRICT ptCopySize, uint32_t hwBackColour) { //! get in byte offset iOffset &= 0x07; iSourceStride = (iSourceStride + 7) & ~0x07; /* deduces offset vector from iOffset for gather loading */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t offSLo = vld1q(__rgb32_draw_pattern_src_incr + iOffset); uint32x4_t offSHi = vld1q(__rgb32_draw_pattern_src_incr + iOffset + 4); /* deduces bitmask vector with wrap from iOffset */ /* hold 8 contiguous values into 2 32-bit vector pair */ uint32x4_t vBitMaskLo = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset); uint32x4_t vBitMaskHi = vld1q(__rgb32_draw_pattern_src_bitmask + iOffset + 4); uint32x4_t vBgColor = vdupq_n_u32(hwBackColour); if (ptCopySize->iWidth <= 4) { /* very tall width case */ /* only bottom parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg = vld1q(pTargetBase); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else if (ptCopySize->iWidth <= 8) { /* bottom and partial upper parts of gather load and bitmask needed */ /* no inner loop */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { mve_pred16_t p = vctp32q(ptCopySize->iWidth - 4); uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSLo); uint32x4_t vTarg = vld1q(pTargetBase); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBase, vTarg); vchSrc = vldrbq_gather_offset_u32(pchSourceBase, offSHi); vTarg = vld1q(pTargetBase + 4); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBase + 4, vTarg, p); pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } else { /* generic case */ for (int32_t y = 0; y < ptCopySize->iHeight; y++) { int32_t cnt = 
ptCopySize->iWidth; uint8_t *pchSourceBaseCur = pchSourceBase; uint32_t *pTargetBaseCur = pTargetBase; while (cnt >= 8) { uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; pchSourceBaseCur += 1; cnt -= 8; } /* tail */ if (cnt > 4) { /* bottom part + upper residual parts */ uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); cnt -= 4; mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q(pTargetBaseCur, vTarg); pTargetBaseCur += 4; vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); vTarg = vld1q(pTargetBaseCur); vchSrc = vandq(vchSrc, vBitMaskHi); vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } else if (cnt > 0) { /* bottom part residual */ uint32x4_t vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); uint32x4_t vTarg = vld1q(pTargetBaseCur); mve_pred16_t p = vctp32q(cnt); vchSrc = vandq(vchSrc, vBitMaskLo); vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq_n_u32(vchSrc, 0)); vst1q_p(pTargetBaseCur, vTarg, p); } pchSourceBase += (iSourceStride >> 3); pTargetBase += iTargetStride; } } } #ifdef EXPERIMENTAL /*----------------------------------------------------------------------------* * Misc & Experimental * *----------------------------------------------------------------------------*/ #if __ARM_2D_HAS_HELIUM_FLOAT__ int16_t __arm_2d_bilinear_interp_rgb16_f16( const uint16_t *phwSourceBase, int16_t iSourceStride, arm_2d_size_t *ptSourceSize, float16_t X, float16_t Y) { float16_t out; int16_t xIndex, yIndex, index; float16_t xdiff, ydiff; float16_t b1, b2, b3, b4; __arm_2d_color_fast_rgb_t packed00, packed01, packed10, packed11; xIndex = (int16_t) X; yIndex = (int16_t) Y; #ifdef INTERP_BOUND_CHECK /* Care taken for table outside boundary */ /* Returns zero output when values are outside table boundary */ if (xIndex < 0 || xIndex > (ptSourceSize->iWidth - 2) || yIndex < 0 || yIndex > (ptSourceSize->iHeight - 2)) { return (0); } #endif /* Calculation of index for two nearest points in X-direction */ index = (xIndex) + (yIndex) * iSourceStride; /* Read two nearest points in X-direction */ __arm_2d_rgb565_unpack(phwSourceBase[index], &packed00); __arm_2d_rgb565_unpack(phwSourceBase[index + 1], &packed01); /* Calculation of index for two nearest points in Y-direction */ index = (xIndex) + (yIndex + 1) * iSourceStride; /* Read two nearest points in Y-direction */ __arm_2d_rgb565_unpack(phwSourceBase[index], &packed10); __arm_2d_rgb565_unpack(phwSourceBase[index + 1], &packed11); const uint16_t *p800 = (uint16_t *) & packed00; const uint16_t *p801 = (uint16_t *) & packed01; const uint16_t *p810 = (uint16_t *) & packed10; const uint16_t *p811 = (uint16_t *) & packed11; __arm_2d_color_fast_rgb_t wTargetPixel; uint_fast8_t n = sizeof(uint32_t) - 1; /* remove alpha */ uint16_t *pchDes = (uint16_t *) & wTargetPixel; /* interpolate individual component */ do { float16_t f00, f01, f10, f11; f00 = (float16_t) * p800++; f01 = (float16_t) * 
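/* bilinear interpolation per component, with dx = X - xIndex and
 * dy = Y - yIndex:
 *
 *     out = f00 + (f01 - f00) * dx + (f10 - f00) * dy
 *               + (f00 - f01 - f10 + f11) * dx * dy
 *         = (1 - dx) * (1 - dy) * f00 + dx * (1 - dy) * f01
 *               + (1 - dx) * dy * f10 + dx * dy * f11
 *
 * which is what the b1..b4 terms below evaluate. */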
p801++; f10 = (float16_t) * p810++; f11 = (float16_t) * p811++; /* Calculation of intermediate values */ b1 = f00; b2 = f01 - f00; b3 = f10 - f00; b4 = f00 - f01 - f10 + f11; /* Calculation of fractional part in X */ xdiff = X - xIndex; /* Calculation of fractional part in Y */ ydiff = Y - yIndex; /* Calculation of bi-linear interpolated output */ out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff; /* convert back in byte */ *pchDes++ = (uint16_t) out; } while (--n); return (__arm_2d_rgb565_pack(&wTargetPixel)); } int32_t __arm_2d_bilinear_interp_rgb32_f16( const uint32_t *phwSourceBase, int16_t iSourceStride, arm_2d_size_t *ptSourceSize, float16_t X, float16_t Y) { float16_t out; int16_t xIndex, yIndex, index; float16_t xdiff, ydiff; float16_t b1, b2, b3, b4; uint32_t packed00, packed01, packed10, packed11; xIndex = (int16_t) X; yIndex = (int16_t) Y; #ifdef INTERP_BOUND_CHECK /* Care taken for table outside boundary */ /* Returns zero output when values are outside table boundary */ if (xIndex < 0 || xIndex > (ptSourceSize->iWidth - 2) || yIndex < 0 || yIndex > (ptSourceSize->iHeight - 2)) { return (0); } #endif /* Calculation of index for two nearest points in X-direction */ index = (xIndex) + (yIndex) * iSourceStride; /* Read two nearest points in X-direction */ packed00 = phwSourceBase[index]; packed01 = phwSourceBase[index + 1]; /* Calculation of index for two nearest points in Y-direction */ index = (xIndex) + (yIndex + 1) * iSourceStride; /* Read two nearest points in Y-direction */ packed10 = phwSourceBase[index]; packed11 = phwSourceBase[index + 1]; const uint8_t *p800 = (uint8_t *) & packed00; const uint8_t *p801 = (uint8_t *) & packed01; const uint8_t *p810 = (uint8_t *) & packed10; const uint8_t *p811 = (uint8_t *) & packed11; uint32_t wTargetPixel; uint_fast8_t n = sizeof(uint32_t); uint8_t *pchDes = (uint8_t *) & wTargetPixel; /* interpolate individual component */ do { float16_t f00, f01, f10, f11; f00 = (float16_t) * p800++; f01 = (float16_t) * p801++; f10 = (float16_t) * p810++; f11 = (float16_t) * p811++; /* Calculation of intermediate values */ b1 = f00; b2 = f01 - f00; b3 = f10 - f00; b4 = f00 - f01 - f10 + f11; /* Calculation of fractional part in X */ xdiff = X - xIndex; /* Calculation of fractional part in Y */ ydiff = Y - yIndex; /* Calculation of bi-linear interpolated output */ out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff; /* convert back in byte */ *pchDes++ = (uint8_t) out; } while (--n); return wTargetPixel; } void __arm_2d_rgb16_scale(uint16_t * phwSourceBase, int16_t iSourceStride, arm_2d_size_t * ptSourceSize, uint16_t * phwTargetBase, int16_t iTargetStride, arm_2d_size_t * ptTargetSize) { #if !defined(TESTING) //! 
todo #else /* horizontal & vertical scale factors to create interpolated from source dimension grid */ /* limit border dimension to ensure source neighbouring points do not exceed limit */ float16_t scaleX = (float16_t) (ptSourceSize->iWidth - 2) / (float16_t) (ptTargetSize->iWidth - 1); float16_t scaleY = (float16_t) (ptSourceSize->iHeight - 2) / (float16_t) (ptTargetSize->iHeight - 1); for (int32_t y = 0; y < ptTargetSize->iHeight; y++) { for (int32_t x = 0; x < ptTargetSize->iWidth; x++) { phwTargetBase[x] = __arm_2d_bilinear_interp_rgb16_f16(phwSourceBase, iSourceStride, ptSourceSize, x * scaleX, y * scaleY); } phwTargetBase += iTargetStride; } #endif } void __arm_2d_rgb32_scale(uint32_t * phwSourceBase, int16_t iSourceStride, arm_2d_size_t * ptSourceSize, uint32_t * phwTargetBase, int16_t iTargetStride, arm_2d_size_t * ptTargetSize) { #if !defined(TESTING) //! todo #else /* horizontal & vertical scale factors to create interpolated from source dimension grid */ /* limit border dimension to ensure source neighbouring points do not exceed limit */ float16_t scaleX = (float16_t) (ptSourceSize->iWidth - 2) / (float16_t) (ptTargetSize->iWidth - 1); float16_t scaleY = (float16_t) (ptSourceSize->iHeight - 2) / (float16_t) (ptTargetSize->iHeight - 1); for (int32_t y = 0; y < ptTargetSize->iHeight; y++) { for (int32_t x = 0; x < ptTargetSize->iWidth; x++) { phwTargetBase[x] = __arm_2d_bilinear_interp_rgb32_f16(phwSourceBase, iSourceStride, ptSourceSize, x * scaleX, y * scaleY); } phwTargetBase += iTargetStride; } #endif } #endif /* * rotation trial */ #define PIBy180_Q30 18740330 #define Q0_TO_Q16(x) ((x) << 16) #define CEIL_Q16_TO_Q0(x) ((x >> 16) + (x & 0xffff ? 1 : 0)) #define ROUND_Q16_TO_Q0(x) ((x + (1 << 15)) >> 16) #define PRINTDBG(x,...) int32_t __arm_fxpt_mult32(int32_t x, int32_t y, const int32_t scal) { int32_t tmp; tmp = (q31_t) (((q63_t) x * y ) >> 32); return tmp << scal; } typedef struct arm_2d_shear_rotate_info { int32_t tanHalfAngle; int32_t sinAngle; int32_t corner[3]; int32_t newDim[3]; } arm_2d_shear_rotate_info; void __arm_2d_rgb32_fill_line( uint32_t * pTargetBase, uint32_t len, uint32_t filler) { while(len > 0) { *pTargetBase++ = filler; len--; } } #ifdef MVE void __arm_2d_rgb32_fill_col( uint32_t * pBase, uint32_t stride, uint32_t blockSize, uint32_t filler) { uint32x4_t idx = vidupq_u32((uint32_t) 0, 1); uint32x4_t pattern = vdupq_n_u32(filler); uint32_t incr = stride * 4; idx = vmulq_n_u32(idx, stride); do { mve_pred16_t p = vctp32q(blockSize); vstrwq_scatter_shifted_offset_p_u32(pBase, idx, pattern, p); idx = vaddq(idx, incr); blockSize -= 4; } while ((int32_t) blockSize > 0); } #else void __arm_2d_rgb32_fill_col( uint32_t * pTargetBase, uint32_t stride, uint32_t len, uint32_t filler) { while(len > 0) { *pTargetBase = filler; pTargetBase += stride; len--; } } #endif #ifdef MVE void __arm_2d_rgb32_move_col( uint32_t * pBase, uint32_t stride, uint32_t offset, uint32_t blockSize) { if (offset == 0) return; if (offset > 0) { // need to operate on the reverse direction // to avoid write corrupting unread samples uint32x4_t idx = vddupq_u32((uint32_t) 3, 1); uint32_t incr = stride * 4; /* jump 4 rows */ uint32_t *pDst = pBase + stride * offset; /* build vector with consective row index */ idx = vmulq_n_u32(idx, stride); /* move index to the bottom */ /* move from bottom in backward direction */ idx = idx + ((blockSize - 4) * stride); do { mve_pred16_t p = vctp32q(blockSize); uint32x4_t item = vldrwq_gather_shifted_offset_z_u32(pBase, idx, p); 
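/* __arm_fxpt_mult32(x, y, scal) returns ((q63_t) x * y) >> (32 - scal);
 * with scal = 2 and Q30 operands the result is again Q30. Worked example:
 * 0.5 * 0.5 -> x = y = 1 << 29, (x * y) >> 32 = 1 << 26, << 2 = 1 << 28,
 * i.e. 0.25 in Q30, as relied upon by __tanFxQ30/__sinFxQ30 below. */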
#ifdef MVE
void __arm_2d_rgb32_move_col(uint32_t *pBase,
                             uint32_t  stride,
                             uint32_t  offset,
                             uint32_t  blockSize)
{
    if (offset == 0)
        return;

    /* operate bottom-up so that writes cannot corrupt not-yet-read rows */
    uint32x4_t idx = vddupq_u32((uint32_t) 3, 1);
    uint32_t   incr = stride * 4;           /* jump 4 rows per iteration */
    uint32_t  *pDst = pBase + stride * offset;

    /* build a vector of consecutive row offsets */
    idx = vmulq_n_u32(idx, stride);
    /* move the indices to the bottom of the column */
    idx = idx + ((blockSize - 4) * stride);

    do {
        mve_pred16_t p = vctp32q(blockSize);

        uint32x4_t item = vldrwq_gather_shifted_offset_z_u32(pBase, idx, p);
        vstrwq_scatter_shifted_offset_p_u32(pDst, idx, item, p);
        idx = vsubq(idx, incr);
        blockSize -= 4;
    } while ((int32_t) blockSize > 0);
}
#else
void __arm_2d_rgb32_move_col(uint32_t *pTargetBase,
                             uint32_t  stride,
                             uint32_t  offset,
                             uint32_t  len)
{
    /* copy bottom-up so that overlapping source and destination stay safe */
    uint32_t *pSrc = pTargetBase + (len - 1) * stride;
    uint32_t *pDst = pSrc + offset * stride;

    while (len > 0) {
        *pDst = *pSrc;
        pDst -= stride;
        pSrc -= stride;
        len--;
    }
}
#endif

void __arm_2d_rgb32_move_line(uint32_t *pTargetBase, uint32_t offset, uint32_t len)
{
    /* copy right-to-left so that overlapping source and destination stay safe */
    uint32_t *pSrc = pTargetBase + (len - 1);
    uint32_t *pDst = pSrc + offset;

    while (len > 0) {
        *pDst = *pSrc;
        pDst--;
        pSrc--;
        len--;
    }
}

int32_t __tanFxQ30(int32_t in)
{
#define TANC3   98947010        /* 0.092151584 */
#define TANC2   126772778       /* 0.118066350 */
#define TANC1   359662342       /* 0.334961658 */
#define TANC0   1073741824      /* 1 */

    /* tan(x) approx x * (TANC0 + TANC1 x^2 + TANC2 x^4 + TANC3 x^6) in Q30 */
    static const int32_t tanFxTab[4] = { TANC0, TANC1, TANC2, TANC3 };
    int32_t in2 = __arm_fxpt_mult32(in, in, 2);     /* x^2 in Q30 */
    int32_t acc;

    /* Horner evaluation of the polynomial in x^2 */
    acc = __arm_fxpt_mult32(tanFxTab[3], in2, 2);
    acc = tanFxTab[2] + acc;
    acc = __arm_fxpt_mult32(acc, in2, 2);
    acc = tanFxTab[1] + acc;
    acc = __arm_fxpt_mult32(acc, in2, 2);
    acc = tanFxTab[0] + acc;

    return __arm_fxpt_mult32(acc, in, 2);
}

int32_t __sinFxQ30(int32_t in)
{
#define SINC3   -209544         /* 0.000195152832 */
#define SINC2   8946589         /* 0.00833216076 */
#define SINC1   -178956841      /* 0.166666546 */
#define SINC0   1073741824      /* 1 */

    /* sin(x) approx x - 0.166666546 x^3 + 0.00833216076 x^5 - 0.000195152832 x^7 */
    static const int32_t sinFxTab[4] = { SINC0, SINC1, SINC2, SINC3 };
    int32_t in2 = __arm_fxpt_mult32(in, in, 2);     /* x^2 in Q30 */
    int32_t acc;

    /* Horner evaluation of the polynomial in x^2 */
    acc = __arm_fxpt_mult32(sinFxTab[3], in2, 2);
    acc = sinFxTab[2] + acc;
    acc = __arm_fxpt_mult32(acc, in2, 2);
    acc = sinFxTab[1] + acc;
    acc = __arm_fxpt_mult32(acc, in2, 2);
    acc = sinFxTab[0] + acc;

    return __arm_fxpt_mult32(acc, in, 2);
}
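/*
 * Illustrative only: the two polynomials above evaluate in Q30, where
 * 1.0 == (1 << 30) and PIBy180_Q30 is pi/180 in Q30, so
 * angle * PIBy180_Q30 converts integer degrees to Q30 radians (|x| < 1.0
 * keeps the argument representable, i.e. roughly |angle| <= 57 degrees).
 * The disabled host-side sketch below compares __sinFxQ30() against libm;
 * the function name is hypothetical and the block is not part of the library.
 */
#if 0
#include <math.h>
#include <stdio.h>
static void __sin_fx_vs_libm(void)
{
    for (int deg = -45; deg <= 45; deg += 5) {
        int32_t xQ30 = deg * PIBy180_Q30;   /* degrees -> Q30 radians */
        float approx = (float) __sinFxQ30(xQ30) / (float) (1 << 30);
        float ref = sinf((float) deg * 3.14159265f / 180.0f);

        printf("%4d deg: sinFx %+.6f  libm %+.6f\n", deg, approx, ref);
    }
}
#endif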
PRINTDBG("startyF %d\n", startyI); } /* X shearing */ colF = newWidth; rowF = newHeight; /* get source image center */ centerX = floorf(newWidth / 2.0f); centerY = floorf(newHeight / 2.0f); topRightX = colF - centerX - tanAngleHalf * (1.0f - centerY); bottomLX = 1.0f - centerX - tanAngleHalf * (rowF - centerY); float32_t finalWidth = ceilf(topRightX - bottomLX + 1.0f); PRINTDBG(" -- xshear %f --\n", finalWidth); for (int i = 0; i < (int) newHeight; i++) { int32_t posYsrc = 1 + i - (int) centerY; int32_t startxI; float32_t startxF = 1.0f - centerX - tanAngleHalf * (float32_t) posYsrc; startxF = startxF - bottomLX; startxF = floorf(startxF + 0.5f); startxI = (int) startxF; PRINTDBG("startx %d \n", startxI); } ptTargetSize->iHeight = (int) newHeight; ptTargetSize->iWidth = (int) finalWidth; } void __arm_2d_3shear_rotate_get_target_dim_fx( arm_2d_size_t * ptSourceSize, arm_2d_size_t * ptTargetSize, arm_2d_shear_rotate_info * ptShearRot, int16_t angle) { int32_t angleR = angle * PIBy180_Q30; int32_t tanHalfAngle = __tanFxQ30(angleR >> 1); int32_t sinAngle = __sinFxQ30(angleR); int32_t srcIWidth = ptSourceSize->iWidth; int32_t srcIWidthHalf = ptSourceSize->iWidth / 2; int32_t srcIHeight = ptSourceSize->iHeight; int32_t srcIHeightHalf = ptSourceSize->iHeight / 2; int32_t Right, Left; PRINTDBG(" ** iWidth %d iHeight %d ***\n", ptSourceSize->iWidth, ptSourceSize->iHeight); Right = Q0_TO_Q16(1 - srcIHeightHalf); Right = __arm_fxpt_mult32(Right, tanHalfAngle, 2); Right = Q0_TO_Q16(srcIWidth - srcIWidthHalf) - Right; Left = Q0_TO_Q16(srcIHeight - srcIHeightHalf); Left = __arm_fxpt_mult32(Left, tanHalfAngle, 2); Left = Q0_TO_Q16(1 - srcIWidthHalf) - Left; int32_t newWidth = Right - Left + Q0_TO_Q16(1); newWidth = CEIL_Q16_TO_Q0(newWidth); ptShearRot->tanHalfAngle = tanHalfAngle; ptShearRot->sinAngle = sinAngle; ptShearRot->corner[0] = Left; ptShearRot->newDim[0] = newWidth; #if DBG printf("--newWidth %d \n", newWidth); for (int i = 0; i < ptSourceSize->iHeight; i++) { int32_t posYsrc = 1 + i - srcIHeightHalf; int32_t startxF = Q0_TO_Q16(1 - srcIWidthHalf) - __arm_fxpt_mult32(tanHalfAngle, Q0_TO_Q16(posYsrc), 2); startxF = startxF - Left; startxF = ROUND_Q16_TO_Q0(startxF); printf("startxI %d\n", startxF); } #endif /* Y shearing */ Left = Q0_TO_Q16(1 - srcIHeightHalf); Left = __arm_fxpt_mult32(Left, sinAngle, 2); Left = Left + Q0_TO_Q16(1 - newWidth / 2); Right = Q0_TO_Q16(newWidth - srcIHeightHalf); Right = __arm_fxpt_mult32(Right, sinAngle, 2); Right = Right + Q0_TO_Q16(srcIHeight - newWidth / 2); int32_t newHeight = Right - Left + Q0_TO_Q16(1); newHeight = CEIL_Q16_TO_Q0(newHeight); ptShearRot->corner[1] = Left; ptShearRot->newDim[1] = newHeight; #if DBG printf("--newHeight %d \n", newHeight); for (int i = 0; i < newWidth; i++) { int32_t posXsrc = 1 + i - srcIHeightHalf; int32_t startyF = Q0_TO_Q16(1 - newWidth / 2) + __arm_fxpt_mult32(sinAngle, Q0_TO_Q16(posXsrc), 2); startyF = startyF - Left; startyF = ROUND_Q16_TO_Q0(startyF); printf(" startxI %d\n", startyF); } #endif /* X shearing */ int32_t newHeightHalf = newHeight / 2; int32_t newWidthHalf = newWidth / 2; Right = Q0_TO_Q16(1 - newHeightHalf); Right = __arm_fxpt_mult32(Right, tanHalfAngle, 2); Right = Q0_TO_Q16(newWidth - newWidthHalf) - Right; Left = Q0_TO_Q16(newHeight - newHeightHalf); Left = __arm_fxpt_mult32(Left, tanHalfAngle, 2); Left = Q0_TO_Q16(1 - newWidthHalf) - Left; int32_t finalWidth = Right - Left + Q0_TO_Q16(1); finalWidth = CEIL_Q16_TO_Q0(finalWidth); ptShearRot->corner[2] = Left; ptShearRot->newDim[2] = finalWidth; 
void __arm_2d_rgb32_rotate_fx(uint32_t                 *phwSourceBase,
                              int16_t                   iSourceStride,
                              arm_2d_size_t            *ptSourceSize,
                              uint32_t                 *phwTargetBase,
                              int16_t                   iTargetStride,
                              arm_2d_size_t            *ptTargetSize,
                              arm_2d_shear_rotate_info *ptShearRot,
                              uint32_t                  filler)
{
    int32_t srcIWidthHalf = ptSourceSize->iWidth / 2;
    int32_t srcIHeightHalf = ptSourceSize->iHeight / 2;
    int32_t tanHalfAngle = ptShearRot->tanHalfAngle;
    int32_t sinAngle = ptShearRot->sinAngle;
    int32_t corner;
    uint32_t *pTarget;

    /* first X-shearing pass: copy each source row into the target with a
     * per-row horizontal offset, padding both sides with the filler colour */
    PRINTDBG("X shear \n");

    corner = ptShearRot->corner[0];
    pTarget = phwTargetBase;
    for (int i = 0; i < ptSourceSize->iHeight; i++) {
        uint32_t *pDst;
        int32_t posYsrc = 1 + i - srcIHeightHalf;
        int32_t start = Q0_TO_Q16(1 - srcIWidthHalf)
                      - __arm_fxpt_mult32(tanHalfAngle, Q0_TO_Q16(posYsrc), 2);

        start = start - corner;
        start = ROUND_Q16_TO_Q0(start);
        PRINTDBG("startxI %d\n", start);

        int32_t residual = ptTargetSize->iWidth - start - ptSourceSize->iWidth;

        pDst = pTarget;
        __arm_2d_rgb32_fill_line(pDst, start, filler);
        pDst += start;

        memcpy(pDst, phwSourceBase, ptSourceSize->iWidth * sizeof(uint32_t));
        pDst += ptSourceSize->iWidth;

        __arm_2d_rgb32_fill_line(pDst, residual, filler);

        pTarget += iTargetStride;
        phwSourceBase += iSourceStride;
    }

    /* Y-shearing pass: shift each target column vertically in place */
    PRINTDBG("Y shear \n");
    int32_t newWidth = ptShearRot->newDim[0];

    corner = ptShearRot->corner[1];
    pTarget = phwTargetBase;
    for (int i = 0; i < newWidth; i++) {
        int32_t posXsrc = 1 + i - srcIHeightHalf;
        int32_t start = Q0_TO_Q16(1 - newWidth / 2)
                      + __arm_fxpt_mult32(sinAngle, Q0_TO_Q16(posXsrc), 2);

        start = start - corner;
        start = ROUND_Q16_TO_Q0(start);
        PRINTDBG(" startyI %d\n", start);

        int32_t residual = ptTargetSize->iHeight - start - ptSourceSize->iHeight;

        __arm_2d_rgb32_move_col(pTarget, iTargetStride, start,
                                ptSourceSize->iHeight);
        __arm_2d_rgb32_fill_col(pTarget, iTargetStride, start, filler);
        __arm_2d_rgb32_fill_col(pTarget
                                + (start + ptSourceSize->iHeight) * iTargetStride,
                                iTargetStride, residual, filler);
        pTarget++;
    }

    /* final X-shearing pass: shift each target row horizontally in place */
    PRINTDBG("X shear \n");
    int32_t newHeight = ptShearRot->newDim[1];
    int32_t newHeightHalf = newHeight / 2;
    int32_t newWidthHalf = newWidth / 2;
    int32_t finalWidth = ptShearRot->newDim[2];

    corner = ptShearRot->corner[2];
    pTarget = phwTargetBase;
    for (int i = 0; i < newHeight; i++) {
        int32_t posYsrc = 1 + i - newHeightHalf;
        int32_t start = Q0_TO_Q16(1 - newWidthHalf)
                      - __arm_fxpt_mult32(tanHalfAngle, Q0_TO_Q16(posYsrc), 2);

        start = start - corner;
        start = ROUND_Q16_TO_Q0(start);
        PRINTDBG(" startxI %d\n", start);

        int32_t residual = finalWidth - start - newWidth;

        __arm_2d_rgb32_move_line(pTarget, start, newWidth);
        __arm_2d_rgb32_fill_line(pTarget, start, filler);
        __arm_2d_rgb32_fill_line(pTarget + start + newWidth, residual, filler);

        pTarget += iTargetStride;
    }
}
#endif

#if defined(__clang__)
#   pragma clang diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif // __ARM_2D_HAS_HELIUM__
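/*
 * Usage sketch for the shear-rotation trial (illustrative; the buffer names
 * and sizes are hypothetical and this is not part of the library API):
 *
 *   arm_2d_size_t tSrcSize = { .iWidth = 64, .iHeight = 64 };
 *   arm_2d_size_t tDstSize;
 *   arm_2d_shear_rotate_info tInfo;
 *
 *   // 1. query the enlarged bounding box and the per-pass parameters
 *   __arm_2d_3shear_rotate_get_target_dim_fx(&tSrcSize, &tDstSize, &tInfo, 30);
 *
 *   // 2. with a target buffer of tDstSize.iWidth * tDstSize.iHeight pixels,
 *   //    run the three shear passes (pass 1 copies from the source, passes
 *   //    2 and 3 shift the target in place)
 *   __arm_2d_rgb32_rotate_fx(phwSrc, tSrcSize.iWidth, &tSrcSize,
 *                            phwDst, tDstSize.iWidth, &tDstSize,
 *                            &tInfo, 0xFF000000);
 */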