/*
 * Mirror of https://gitee.com/Lyon1998/pikapython.git
 * (synced 2025-01-22 17:12:55 +08:00; 6549 lines, 277 KiB, C)
 */
/*
|
|
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the License); you may
|
|
* not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/* ----------------------------------------------------------------------
|
|
* Project: Arm-2D Library
|
|
* Title: arm-2d_helium.c
|
|
* Description: Acceleration extensions using Helium.
|
|
*
|
|
* $Date: 22. Sep 2021
|
|
* $Revision: V.0.12.0
|
|
*
|
|
* Target Processor: Cortex-M cores with Helium
|
|
*
|
|
* -------------------------------------------------------------------- */
|
|
|
|
#define __ARM_2D_IMPL__
|
|
|
|
#include "arm_2d.h"
|
|
#include "__arm_2d_impl.h"
|
|
|
|
#if defined(__ARM_2D_HAS_HELIUM__) && __ARM_2D_HAS_HELIUM__
|
|
|
|
#if defined(__clang__)
|
|
# pragma clang diagnostic push
|
|
# pragma clang diagnostic ignored "-Wunknown-warning-option"
|
|
# pragma clang diagnostic ignored "-Wreserved-identifier"
|
|
# pragma clang diagnostic ignored "-Wincompatible-pointer-types-discards-qualifiers"
|
|
# pragma clang diagnostic ignored "-Wcast-qual"
|
|
# pragma clang diagnostic ignored "-Wcast-align"
|
|
# pragma clang diagnostic ignored "-Wextra-semi-stmt"
|
|
# pragma clang diagnostic ignored "-Wsign-conversion"
|
|
# pragma clang diagnostic ignored "-Wunused-function"
|
|
# pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
|
|
# pragma clang diagnostic ignored "-Wdouble-promotion"
|
|
# pragma clang diagnostic ignored "-Wunused-parameter"
|
|
# pragma clang diagnostic ignored "-Wimplicit-float-conversion"
|
|
# pragma clang diagnostic ignored "-Wimplicit-int-conversion"
|
|
# pragma clang diagnostic ignored "-Wtautological-pointer-compare"
|
|
# pragma clang diagnostic ignored "-Wmissing-prototypes"
|
|
# pragma clang diagnostic ignored "-Wsign-compare"
|
|
# pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
|
|
# pragma clang diagnostic ignored "-Wpadded"
|
|
# pragma clang diagnostic ignored "-Wvector-conversion"
|
|
# pragma clang diagnostic ignored "-Wundef"
|
|
#endif
|
|
|
|
|
|
|
|
#include "__arm_2d_paving_helium.h"
|
|
#include "__arm_2d_math_helium.h"
|
|
#include "__arm_2d_utils_helium.h"
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
|
|
|
|
|
|
/*! \brief initialise the helium service
|
|
*! \param none
|
|
*! \return none
|
|
*/
|
|
void __arm_2d_helium_init(void)
{
    /*
     * Intentionally empty: there is no state to set up here.
     * Even if this is empty, do not remove it.
     * NOTE(review): presumably the symbol is referenced by the library
     * start-up code, so deleting it would break the link - confirm.
     */
}
|
|
|
|
|
|
/*----------------------------------------------------------------------------*
|
|
* Code Template for tile operations *
|
|
*----------------------------------------------------------------------------*/
|
|
|
|
/*
 * Instantiate the shared tile-operation template once per pixel width.
 * Each pass defines the colour suffix, the pixel integer type and its bit
 * width, then pulls in the template.
 *
 * NOTE(review): the three macros are redefined before every include without
 * an #undef in this file, so __arm_2d_copy_helium.inc presumably undefines
 * them at its end - confirm.
 */

/* 8-bit pixels */
#define __API_COLOUR                c8bit
#define __API_INT_TYPE              uint8_t
#define __API_INT_TYPE_BIT_NUM      8

#include "__arm_2d_copy_helium.inc"

/* 16-bit pixels */
#define __API_COLOUR                rgb16
#define __API_INT_TYPE              uint16_t
#define __API_INT_TYPE_BIT_NUM      16

#include "__arm_2d_copy_helium.inc"

/* 32-bit pixels */
#define __API_COLOUR                rgb32
#define __API_INT_TYPE              uint32_t
#define __API_INT_TYPE_BIT_NUM      32

#include "__arm_2d_copy_helium.inc"
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------------*
|
|
* Specialized Copy Routines *
|
|
*----------------------------------------------------------------------------*/
|
|
|
|
|
|
static
|
|
void __arm_copy_16_mve_narrow( uint16_t *phwSource,
|
|
int16_t iSourceStride,
|
|
uint16_t *phwTarget,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *ptCopySize)
|
|
{
|
|
#ifdef USE_MVE_INTRINSICS
|
|
for (int32_t x = 0; x < ptCopySize->iWidth; x++) {
|
|
uint16x8_t srcStr = vidupq_u16((uint32_t) 0, 1);
|
|
uint16x8_t dstStr = vidupq_u16((uint32_t) 0, 1);
|
|
|
|
srcStr = srcStr * iSourceStride;
|
|
dstStr = dstStr * iTargetStride;
|
|
|
|
for (int32_t y = 0; y < ptCopySize->iHeight / 8; y++) {
|
|
uint16x8_t in = vldrhq_gather_shifted_offset_u16(phwSource, srcStr);
|
|
srcStr = vaddq_n_u16(srcStr, (8 * iSourceStride));
|
|
|
|
vstrhq_scatter_shifted_offset_u16(phwTarget, dstStr, in);
|
|
dstStr = vaddq_n_u16(dstStr, (8 * iTargetStride));
|
|
}
|
|
|
|
phwSource++;
|
|
phwTarget++;
|
|
}
|
|
#else
|
|
__asm volatile(
|
|
" clrm {r2, r4} \n"
|
|
" vidup.u16 q0, r2, #1 \n"
|
|
" vmul.i16 q2, q0, %[iSourceStride] \n"
|
|
" vidup.u16 q1, r4, #1 \n"
|
|
" vmul.i16 q3, q1, %[iTargetStride] \n"
|
|
|
|
"3: \n"
|
|
/* outer loop, iterates over columns */
|
|
/* size = ptCopySize->iWidth */
|
|
" vmov q0, q2 \n"
|
|
" vmov q1, q3 \n"
|
|
|
|
/* inner loop, iterates over rows (size = ptCopySize->iHeight) */
|
|
" wlstp.16 lr, %[iHeight], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" vldrh.u16 q4, [%[phwSource], q0, uxtw #1] \n"
|
|
" vadd.i16 q0, q0, %[iSourceStridex8] \n"
|
|
" vstrh.16 q4, [%[phwTarget], q1, uxtw #1] \n"
|
|
" vadd.i16 q1, q1, %[iTargetStridex8] \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
" add.n %[phwSource], #2 \n"
|
|
" add.n %[phwTarget], #2 \n"
|
|
" subs %[iWidth], #1 \n"
|
|
" bne 3b \n"
|
|
|
|
: [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
|
|
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth),
|
|
[iSourceStride] "r" (iSourceStride),[iSourceStridex8] "r" (iSourceStride*8),
|
|
[iTargetStride] "r" (iTargetStride),[iTargetStridex8] "r" (iTargetStride*8)
|
|
: "r2", "r4", "q0", "q1", "q2", "q3", "q4", "memory", "r14", "cc");
|
|
#endif
|
|
}
|
|
|
|
static
|
|
void __arm_copy_32_mve_narrow( uint32_t *pwSource,
|
|
int32_t iSourceStride,
|
|
uint32_t *pwTarget,
|
|
int32_t iTargetStride,
|
|
arm_2d_size_t *ptCopySize)
|
|
{
|
|
#ifdef USE_MVE_INTRINSICS
|
|
for (int_fast32_t x = 0; x < ptCopySize->iWidth; x++) {
|
|
uint32x4_t srcStr = vidupq_u32((uint32_t) 0, 1);
|
|
uint32x4_t dstStr = vidupq_u32((uint32_t) 0, 1);
|
|
|
|
srcStr = srcStr * iSourceStride;
|
|
dstStr = dstStr * iTargetStride;
|
|
|
|
for (int_fast32_t y = 0; y < ptCopySize->iHeight / 4; y++) {
|
|
uint32x4_t in = vldrwq_gather_shifted_offset_u32(pwSource, srcStr);
|
|
srcStr = vaddq_n_u32(srcStr, (4 * iSourceStride));
|
|
|
|
vstrwq_scatter_shifted_offset_u32(pwTarget, dstStr, in);
|
|
dstStr = vaddq_n_u32(dstStr, (4 * iTargetStride));
|
|
}
|
|
|
|
pwSource++;
|
|
pwTarget++;
|
|
}
|
|
#else
|
|
__asm volatile(
|
|
|
|
" clrm {r2, r4} \n"
|
|
" vidup.u32 q0, r2, #1 \n"
|
|
" vmul.i32 q2, q0, %[iSourceStride] \n"
|
|
" vidup.u32 q1, r4, #1 \n"
|
|
" vmul.i32 q3, q1, %[iTargetStride] \n"
|
|
|
|
"3: \n"
|
|
/* outer loop, iterates over columns */
|
|
/* size = ptCopySize->iWidth */
|
|
" vmov q0, q2 \n"
|
|
" vmov q1, q3 \n"
|
|
|
|
/* inner loop, iterates over rows (size = ptCopySize->iHeight) */
|
|
" wlstp.32 lr, %[iHeight], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" vldrw.u32 q4, [%[pwSource], q0, uxtw #2] \n"
|
|
" vadd.i32 q0, q0, %[iSourceStridex4] \n"
|
|
" vstrw.32 q4, [%[pwTarget], q1, uxtw #2] \n"
|
|
" vadd.i32 q1, q1, %[iTargetStridex4] \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
" add.n %[pwSource], #4 \n"
|
|
" add.n %[pwTarget], #4 \n"
|
|
" subs %[iWidth], #1 \n"
|
|
" bne 3b \n"
|
|
|
|
: [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
|
|
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth),
|
|
[iSourceStride] "r" (iSourceStride),[iSourceStridex4] "r" (iSourceStride*4),
|
|
[iTargetStride] "r" (iTargetStride),[iTargetStridex4] "r" (iTargetStride*4)
|
|
: "r2", "r4", "q0", "q1", "q2", "q3", "q4", "memory", "r14", "cc");
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb16_copy( uint16_t *phwSource,
|
|
int16_t iSourceStride,
|
|
uint16_t *phwTarget,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *ptCopySize)
|
|
{
|
|
/*
|
|
* 16-bit Narrow copy case:
|
|
* use column copy with scatter / gather
|
|
*/
|
|
if(ptCopySize->iWidth <= 4) {
|
|
__arm_copy_16_mve_narrow(phwSource,
|
|
iSourceStride,
|
|
phwTarget,
|
|
iTargetStride,
|
|
ptCopySize);
|
|
|
|
} else if((((uint32_t)phwSource & 3) == 0) && (((uint32_t)phwTarget & 3) == 0)
|
|
&& ((iSourceStride & 3) == 0) && ((iTargetStride & 3) ==0)) {
|
|
/*
|
|
* source / dst & strides are 64-bit aligned
|
|
* use scalar LDRD/STRD, faster than back to back vector VLDR/VSTR on M55
|
|
*/
|
|
__asm volatile(
|
|
"3: \n"
|
|
|
|
" mov r0, %[phwSource] \n"
|
|
" mov r1, %[phwTarget] \n"
|
|
|
|
/* scalar version faster (no DTCM bank conflict)*/
|
|
" wls lr, %[iWidth], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" ldrd r2, r3, [r0], #8 \n"
|
|
" strd r2, r3, [r1], #8 \n"
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
// tail
|
|
" wls lr, %[iWidthTail], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" ldrh r2, [r0], #2 \n"
|
|
" strh r2, [r1], #2 \n"
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
|
|
" add %[phwSource], %[iSourceStride] \n"
|
|
" add %[phwTarget], %[iTargetStride] \n"
|
|
" subs %[iHeight], #1 \n"
|
|
" bne 3b \n"
|
|
|
|
: [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
|
|
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth/4),
|
|
[iWidthTail] "r" (ptCopySize->iWidth & 3),
|
|
[iSourceStride] "r" (iSourceStride*sizeof(uint16_t)),
|
|
[iTargetStride] "r" (iTargetStride*sizeof(uint16_t))
|
|
: "r0", "r1", "r2", "r3", "q0", "memory", "r14", "cc"
|
|
);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* generic column major 16-bit 2D copy
|
|
*/
|
|
int32_t iWidth = ptCopySize->iWidth;
|
|
int32_t iHeight = ptCopySize->iHeight;
|
|
|
|
__asm volatile(
|
|
" mov r2, %[iHeight] \n"
|
|
"3: \n"
|
|
|
|
" mov r0, %[phwSource] \n"
|
|
" mov r1, %[phwTarget] \n"
|
|
|
|
" wlstp.16 lr, %[iWidth], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" vldrh.u16 q0, [r0], #16 \n"
|
|
" vstrh.16 q0, [r1], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
" add %[phwSource], %[iSourceStride] \n"
|
|
" add %[phwTarget], %[iTargetStride] \n"
|
|
" subs r2, #1 \n"
|
|
" bne 3b \n"
|
|
|
|
: [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
|
|
: [iHeight] "r"(iHeight), [iWidth] "r" (iWidth),
|
|
[iSourceStride] "r" (iSourceStride*sizeof(uint16_t)),
|
|
[iTargetStride] "r" (iTargetStride*sizeof(uint16_t))
|
|
: "r0", "r1", "r2", "q0", "memory", "r14", "cc");
|
|
}
|
|
}
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb32_copy( uint32_t *pwSource,
|
|
int16_t iSourceStride,
|
|
uint32_t *pwTarget,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *ptCopySize)
|
|
{
|
|
if(ptCopySize->iWidth <= 2) {
|
|
/*
|
|
* 32-bit Narrow copy case:
|
|
* use column copy with scatter / gather
|
|
*/
|
|
__arm_copy_32_mve_narrow(pwSource,
|
|
iSourceStride,
|
|
pwTarget,
|
|
iTargetStride,
|
|
ptCopySize);
|
|
|
|
} else if((((uint32_t)pwSource & 3) == 0) && (((uint32_t)pwTarget & 3) == 0)
|
|
&& ((iSourceStride & 3) == 0) && ((iTargetStride & 3) ==0)) {
|
|
/*
|
|
* source / dst & strides are 64-bit aligned
|
|
* use scalar LDRD/STRD, faster than back to back vector VLDR/VSTR on M55
|
|
*/
|
|
__asm volatile(
|
|
"3: \n"
|
|
" mov r0, %[pwSource] \n"
|
|
" mov r1, %[pwTarget] \n"
|
|
|
|
/* scalar version faster (no DTCM bank conflict)*/
|
|
" wls lr, %[iWidth], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" ldrd r2, r3, [r0], #8 \n"
|
|
" strd r2, r3, [r1], #8 \n"
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
// tail
|
|
" wls lr, %[iWidthTail], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" ldr r2, [r0], #4 \n"
|
|
" str r2, [r1], #4 \n"
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
|
|
" add %[pwSource], %[iSourceStride] \n"
|
|
" add %[pwTarget], %[iTargetStride] \n"
|
|
" subs %[iHeight], #1 \n"
|
|
" bne 3b \n"
|
|
|
|
: [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
|
|
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth/2),
|
|
[iWidthTail] "r" (ptCopySize->iWidth & 1),
|
|
[iSourceStride] "r" (iSourceStride*sizeof(uint32_t)),
|
|
[iTargetStride] "r" (iTargetStride*sizeof(uint32_t))
|
|
: "r0", "r1", "r2", "r3", "q0", "memory", "r14", "cc"
|
|
);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* generic column major 32-bit 2D copy
|
|
*/
|
|
__asm volatile(
|
|
"3: \n"
|
|
|
|
" mov r0, %[pwSource] \n"
|
|
" mov r1, %[pwTarget] \n"
|
|
|
|
" wlstp.32 lr, %[iWidth], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" vldrw.32 q0, [r0], #16 \n"
|
|
" vstrw.32 q0, [r1], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
" add %[pwSource], %[iSourceStride] \n"
|
|
" add %[pwTarget], %[iTargetStride] \n"
|
|
" subs %[iHeight], #1 \n"
|
|
" bne 3b \n"
|
|
|
|
: [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
|
|
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth),
|
|
[iSourceStride] "r" (iSourceStride*sizeof(uint32_t)),
|
|
[iTargetStride] "r" (iTargetStride*sizeof(uint32_t))
|
|
: "r0", "r1", "q0", "memory", "r14", "cc");
|
|
}
|
|
}
|
|
|
|
/*----------------------------------------------------------------------------*
|
|
* alpha blending *
|
|
*----------------------------------------------------------------------------*/
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_gray8_alpha_blending(uint8_t * __RESTRICT pSourceBase,
|
|
int16_t iSourceStride,
|
|
uint8_t * __RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint_fast8_t chRatio)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16_t hwRatioCompl = 256 - chRatio;
|
|
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
|
|
const uint8_t *pSource = pSourceBase;
|
|
uint8_t *pTarget = pTargetBase;
|
|
int32_t blkCnt = iWidth;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
do {
|
|
mve_pred16_t tailPred = vctp16q(blkCnt);
|
|
uint16x8_t vecTgt = vldrbq_z_u16(pTarget, tailPred);
|
|
uint16x8_t vecSrc = vldrbq_z_u16(pSource, tailPred);
|
|
|
|
vecTgt = vmulq_x(vecTgt, hwRatioCompl, tailPred);
|
|
vecTgt = vmlaq_m(vecTgt, vecSrc, chRatio, tailPred);
|
|
vecTgt = vecTgt >> 8;
|
|
|
|
vstrbq_p_u16(pTarget , vecTgt , tailPred);
|
|
|
|
pSource += 8;
|
|
pTarget += 8;
|
|
blkCnt -= 8;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
#else
|
|
__asm volatile(
|
|
" vldrb.u16 q0, [%[pTarget]] \n"
|
|
".p2align 2 \n"
|
|
" wls lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
|
|
" vldrb.u16 q1, [%[pSource]], #8 \n"
|
|
" vmla.u16 q0, q1, %[chRatio] \n"
|
|
" vldrb.u16 q2, [%[pTarget], #8] \n"
|
|
" vshr.u16 q0, q0, #8 \n"
|
|
" vstrb.u16 q0, [%[pTarget]], #8 \n"
|
|
" vmul.u16 q2, q2, %[hwRatioCompl] \n"
|
|
" vldrb.u16 q1, [%[pSource]], #8 \n"
|
|
" vmla.u16 q2, q1, %[chRatio] \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
" vstrb.u16 q2, [%[pTarget]], #8 \n"
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
/* tail */
|
|
" wlstp.16 lr, %[tail], 1f \n"
|
|
"2: \n"
|
|
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
|
|
" vldrb.u16 q1, [%[pSource]], #8 \n"
|
|
" vmla.u16 q0, q1, %[chRatio] \n"
|
|
" vshr.u16 q1, q0, #8 \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" vstrb.u16 q1, [%[pTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pTarget] "+r"(pTarget), [pSource] "+r" (pSource)
|
|
: [chRatio] "r" (chRatio), [hwRatioCompl] "r" (hwRatioCompl),
|
|
[loopCnt] "r"(blkCnt/16), [tail] "r"(blkCnt & 0xf)
|
|
:"q0", "q1", "q2", "memory", "r14");
|
|
#endif
|
|
pSourceBase += (iSourceStride);
|
|
pTargetBase += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_gray8_alpha_blending_colour_keying(uint8_t * __RESTRICT pSourceBase,
|
|
int16_t iSourceStride,
|
|
uint8_t * __RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *
|
|
__RESTRICT ptCopySize,
|
|
uint_fast8_t chRatio,
|
|
uint8_t Colour)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16_t hwRatioCompl = 256 - chRatio;
|
|
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
|
|
const uint8_t *pSource = pSourceBase;
|
|
uint8_t *pTarget = pTargetBase;
|
|
int32_t blkCnt = iWidth;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
do {
|
|
mve_pred16_t tailPred = vctp16q(blkCnt);
|
|
uint16x8_t vecTgt = vldrbq_z_u16(pTarget, tailPred);
|
|
uint16x8_t vecSrc = vldrbq_z_u16(pSource, tailPred);
|
|
|
|
vecTgt = vmulq_x(vecTgt, hwRatioCompl, tailPred);
|
|
vecTgt = vmlaq_m(vecTgt, vecSrc, chRatio, tailPred);
|
|
vecTgt = vecTgt >> 8;
|
|
|
|
vstrbq_p_u16(pTarget , vecTgt ,
|
|
vcmpneq_m_n_u16(vecSrc, (uint16_t)Colour, tailPred));
|
|
|
|
pSource += 8;
|
|
pTarget += 8;
|
|
blkCnt -= 8;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
#else
|
|
__asm volatile(
|
|
" vldrb.u16 q0, [%[pTarget]] \n"
|
|
".p2align 2 \n"
|
|
" wls lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
|
|
" vldrb.u16 q1, [%[pSource]], #8 \n"
|
|
" vmla.u16 q0, q1, %[chRatio] \n"
|
|
" vldrb.u16 q2, [%[pTarget], #8] \n"
|
|
" vshr.u16 q0, q0, #8 \n"
|
|
" vpt.u16 ne, q1, %[Colour] \n"
|
|
" vstrbt.u16 q0, [%[pTarget]], #8 \n"
|
|
" vmul.u16 q2, q2, %[hwRatioCompl] \n"
|
|
" vldrb.u16 q1, [%[pSource]], #8 \n"
|
|
" vmla.u16 q2, q1, %[chRatio] \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
" vpt.u16 ne, q1, %[Colour] \n"
|
|
" vstrbt.u16 q2, [%[pTarget]], #8 \n"
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
/* tail */
|
|
" wlstp.16 lr, %[tail], 1f \n"
|
|
"2: \n"
|
|
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
|
|
" vldrb.u16 q1, [%[pSource]], #8 \n"
|
|
" vmla.u16 q0, q1, %[chRatio] \n"
|
|
" vshr.u16 q2, q0, #8 \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" vpt.u16 ne, q1, %[Colour] \n"
|
|
" vstrbt.u16 q2, [%[pTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pTarget] "+r"(pTarget), [pSource] "+r" (pSource)
|
|
: [chRatio] "r" (chRatio), [hwRatioCompl] "r" (hwRatioCompl),
|
|
[loopCnt] "r"(blkCnt/16), [tail] "r"(blkCnt & 0xf),
|
|
[Colour] "r" (Colour)
|
|
:"q0", "q1", "q2", "memory", "r14");
|
|
#endif
|
|
pSourceBase += (iSourceStride);
|
|
pTargetBase += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_gray8_colour_filling_with_opacity(uint8_t * __restrict pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *
|
|
__restrict ptCopySize,
|
|
uint8_t Colour, uint_fast8_t chRatio)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16_t hwRatioCompl = 256 - chRatio;
|
|
uint16x8_t vecSrc = vdupq_n_u16(Colour);
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
|
|
uint8_t *pTarget = pTargetBase;
|
|
int32_t blkCnt = iWidth;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
do {
|
|
mve_pred16_t tailPred = vctp16q(blkCnt);
|
|
uint16x8_t vecTgt = vldrbq_z_u16(pTarget, tailPred);
|
|
|
|
vecTgt = vmulq_x(vecTgt, hwRatioCompl, tailPred);
|
|
vecTgt = vmlaq_m(vecTgt, vecSrc, chRatio, tailPred);
|
|
vecTgt = vecTgt >> 8;
|
|
|
|
vstrbq_p_u16(pTarget , vecTgt , tailPred);
|
|
|
|
pTarget += 8;
|
|
blkCnt -= 8;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
#else
|
|
__asm volatile(
|
|
" vldrb.u16 q0, [%[pTarget]] \n"
|
|
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
|
|
".p2align 2 \n"
|
|
" wls lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
|
|
" vmla.u16 q0, %[vecSrc], %[chRatio] \n"
|
|
" vldrb.u16 q2, [%[pTarget], #8] \n"
|
|
" vshr.u16 q0, q0, #8 \n"
|
|
" vmul.u16 q2, q2, %[hwRatioCompl] \n"
|
|
" vstrb.u16 q0, [%[pTarget]], #8 \n"
|
|
" vmla.u16 q2, %[vecSrc], %[chRatio] \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
|
|
" vstrb.u16 q2, [%[pTarget]], #8 \n"
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
/* tail */
|
|
" wlstp.16 lr, %[tail], 1f \n"
|
|
"2: \n"
|
|
|
|
" vmla.u16 q0, %[vecSrc], %[chRatio] \n"
|
|
" vshr.u16 q2, q0, #8 \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
|
|
" vstrb.u16 q2, [%[pTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pTarget] "+r"(pTarget)
|
|
: [chRatio] "r" (chRatio), [hwRatioCompl] "r" (hwRatioCompl),
|
|
[loopCnt] "r"(blkCnt/16), [tail] "r"(blkCnt & 0xf),
|
|
[vecSrc] "t" (vecSrc)
|
|
:"q0", "q2", "memory", "r14");
|
|
#endif
|
|
pTargetBase += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_alpha_blending( uint16_t *phwSourceBase,
|
|
int16_t iSourceStride,
|
|
uint16_t *phwTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *ptCopySize,
|
|
uint_fast8_t chRatio)
|
|
{
|
|
#ifdef USE_MVE_INTRINSICS
|
|
int32_t blkCnt;
|
|
uint16_t ratio1x8 = chRatio * 8;
|
|
uint16_t ratio1x4 = chRatio * 4;
|
|
uint16_t ratio2x8 = (256 - chRatio) * 8;
|
|
uint16_t ratio2x4 = (256 - chRatio) * 4;
|
|
|
|
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
|
|
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
|
|
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
|
|
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
|
|
|
|
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
const uint16_t *phwSource = phwSourceBase;
|
|
uint16_t *phwTarget = phwTargetBase;
|
|
|
|
blkCnt = ptCopySize->iWidth;
|
|
do {
|
|
|
|
uint16x8_t vecIn;
|
|
uint16x8_t vecR0, vecB0, vecG0;
|
|
uint16x8_t vecR1, vecB1, vecG1;
|
|
|
|
/* unpack 1st stream */
|
|
vecIn = vld1q(phwSource);
|
|
vecR0 = vecIn & vecMaskR;
|
|
|
|
vecB0 = vecIn >> 11;
|
|
|
|
vecG0 = vecIn >> 5;
|
|
vecG0 = vecG0 & vecMaskG;
|
|
|
|
|
|
/* unpack 2nd stream */
|
|
vecIn = vld1q(phwTarget);
|
|
vecR1 = vecIn & vecMaskR;
|
|
|
|
vecB1 = vecIn >> 11;
|
|
|
|
vecG1 = vecIn >> 5;
|
|
vecG1 = vecG1 & vecMaskG;
|
|
|
|
|
|
/* merge */
|
|
vecR0 = vecR0 * ratio1x8 + vecR1 * ratio2x8;
|
|
vecR0 = vecR0 >> 8;
|
|
|
|
vecG0 = vecG0 * ratio1x4 + vecG1 * ratio2x4;
|
|
vecG0 = vecG0 >> 8;
|
|
|
|
vecB0 = vecB0 * ratio1x8 + vecB1 * ratio2x8;
|
|
vecB0 = vecB0 >> 8;
|
|
|
|
|
|
/* pack */
|
|
uint16x8_t vOut = vecR0 >> 3 | vmulq((vecG0 & vecMaskGpck), 8)
|
|
| vmulq((vecB0 & vecMaskBpck), 256);
|
|
|
|
|
|
vst1q(phwTarget, vOut);
|
|
|
|
phwSource += 8;
|
|
phwTarget += 8;
|
|
blkCnt -= 8;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
phwSourceBase += iSourceStride;
|
|
phwTargetBase += iTargetStride;
|
|
}
|
|
|
|
#else /* USE_MVE_INTRINSICS */
|
|
|
|
|
|
uint16_t ratio1x8 = chRatio * 8;
|
|
uint16_t ratio1x4 = chRatio * 4;
|
|
uint16_t ratio2x8 = (256 - chRatio) * 8;
|
|
uint16_t ratio2x4 = (256 - chRatio) * 4;
|
|
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
|
|
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
|
|
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
|
|
uint32_t iWidth = ptCopySize->iWidth;
|
|
int32_t row = ptCopySize->iHeight;
|
|
uint16x8_t scratch[1];
|
|
|
|
vst1q((uint16_t *)scratch, vdupq_n_u16(0x00fc));
|
|
|
|
do {
|
|
const uint16_t *pSource = phwSourceBase;
|
|
uint16_t *pTarget = phwTargetBase;
|
|
register unsigned loopCnt __asm("lr");
|
|
loopCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" vldrh.u16 q4, [%[pTarget]] \n"
|
|
" vldrh.u16 q5, [%[pSource]], #16 \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
// B target extraction
|
|
// right shift by 5 (x 1/32) for M55 friendly
|
|
// IV / Mul pipe interleaving
|
|
" vqdmulh.s16 q2, q4, %[rshft5] \n"
|
|
" vand q7, q4, %[vecMaskR] \n"
|
|
|
|
" vmul.i16 q6, q7, %[ratio2x8] \n"
|
|
// B source extraction
|
|
" vand q7, q5, %[vecMaskR] \n"
|
|
// B mix
|
|
" vmla.u16 q6, q7, %[ratio1x8] \n"
|
|
// G extraction
|
|
" vand q2, q2, %[vecMaskG] \n"
|
|
" vshr.u16 q7, q5, #5 \n"
|
|
" vmul.i16 q2, q2, %[ratio2x4] \n"
|
|
// G extraction
|
|
" vand q7, q7, %[vecMaskG] \n"
|
|
// G mix
|
|
" vmla.u16 q2, q7, %[ratio1x4] \n"
|
|
// R extraction
|
|
" vshr.u16 q4, q4, #11 \n"
|
|
" vmul.i16 q7, q4, %[ratio2x8] \n"
|
|
// R extraction
|
|
" vshr.u16 q5, q5, #11 \n"
|
|
// R mix
|
|
" vmla.u16 q7, q5, %[ratio1x8] \n"
|
|
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
" vldrh.16 q5, [%[scratch]] \n"
|
|
|
|
" vand q2, q2, q5 \n"
|
|
// vmulq((vecG0 & 0x00fc), 8)
|
|
" vmul.i16 q2, q2, %[eight] \n"
|
|
" vshr.u16 q4, q7, #8 \n"
|
|
// schedule next source load
|
|
" vldrh.u16 q5, [%[pSource]], #16 \n"
|
|
" vand q7, q4, %[vecMaskBpck] \n"
|
|
// pack R & G
|
|
// vmulq((vecG0 & vecMaskGpck), 8) + vmulq((vecR0 & vecMaskRpck), 256)
|
|
" vmla.u16 q2, q7, %[twofiftysix] \n"
|
|
// downshift B ((vecB0 >> 8) >> 3)
|
|
" vshr.u16 q7, q6, #11 \n"
|
|
// schedule next target load (pre offset as target not imcrementred so far)
|
|
" vldrh.u16 q4, [%[pTarget], #16] \n"
|
|
// pack blue with R&G
|
|
" vorr q2, q2, q7 \n"
|
|
|
|
" vstrh.16 q2, [%[pTarget]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pSource] "+r"(pSource), [pTarget] "+r" (pTarget), [loopCnt] "+r"(loopCnt)
|
|
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
|
|
[vecMaskBpck] "t" (vecMaskBpck),
|
|
[ratio1x8] "r" (ratio1x8), [ratio2x8] "r" (ratio2x8),
|
|
[ratio1x4] "r" (ratio1x4), [ratio2x4] "r" (ratio2x4),
|
|
[eight] "r" (8), [twofiftysix] "r" (256), [rshft5] "r" (1024), [scratch] "r" (scratch)
|
|
: "q2", "q4", "q5", "q6", "q7", "memory" );
|
|
|
|
phwSourceBase += iSourceStride;
|
|
phwTargetBase += iTargetStride;
|
|
} while (--row);
|
|
#endif /* USE_MVE_INTRINSICS */
|
|
}
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_colour_filling_with_opacity(
|
|
uint16_t *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
uint16_t Colour,
|
|
uint_fast8_t chRatio)
|
|
{
|
|
#ifdef USE_MVE_INTRINSICS
|
|
int32_t blkCnt;
|
|
uint16_t ratio1x8 = chRatio * 8;
|
|
uint16_t ratio1x4 = chRatio * 4;
|
|
uint16_t ratio2x8 = (256 - chRatio) * 8;
|
|
uint16_t ratio2x4 = (256 - chRatio) * 4;
|
|
|
|
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
|
|
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
|
|
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
|
|
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
|
|
uint16x8_t vecIn;
|
|
uint16x8_t vecColorR, vecColorB, vecColorG;
|
|
|
|
/* unpack color & scale */
|
|
vecIn = vdupq_n_u16(Colour);
|
|
vecColorR = (vecIn & vecMaskR) * ratio1x8;
|
|
|
|
vecColorB = (vecIn >> 11) * ratio1x8;
|
|
|
|
vecColorG = vecIn >> 5;
|
|
vecColorG = (vecColorG & vecMaskG) * ratio1x4;
|
|
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
uint16_t *phwTarget = pTargetBase;
|
|
|
|
blkCnt = ptCopySize->iWidth;
|
|
do {
|
|
uint16x8_t vecR0, vecB0, vecG0;
|
|
uint16x8_t vecR1, vecB1, vecG1;
|
|
|
|
/* unpack stream */
|
|
vecIn = vld1q(phwTarget);
|
|
vecR1 = vecIn & vecMaskR;
|
|
|
|
vecB1 = vecIn >> 11;
|
|
|
|
vecG1 = vecIn >> 5;
|
|
vecG1 = vecG1 & vecMaskG;
|
|
|
|
|
|
/* merge */
|
|
vecR0 = vecColorR + vecR1 * ratio2x8;
|
|
vecR0 = vecR0 >> 8;
|
|
|
|
vecG0 = vecColorG + vecG1 * ratio2x4;
|
|
vecG0 = vecG0 >> 8;
|
|
|
|
vecB0 = vecColorB + vecB1 * ratio2x8;
|
|
vecB0 = vecB0 >> 8;
|
|
|
|
/* pack */
|
|
uint16x8_t vOut = vecR0 >> 3 | vmulq((vecG0 & vecMaskGpck), 8)
|
|
| vmulq((vecB0 & vecMaskBpck), 256);
|
|
|
|
vst1q(phwTarget, vOut);
|
|
|
|
phwTarget += 8;
|
|
blkCnt -= 8;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
|
|
#else /* USE_MVE_INTRINSICS */
|
|
|
|
uint16_t ratio1x8 = chRatio * 8;
|
|
uint16_t ratio1x4 = chRatio * 4;
|
|
uint16_t ratio2x8 = (256 - chRatio) * 8;
|
|
uint16_t ratio2x4 = (256 - chRatio) * 4;
|
|
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
|
|
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
|
|
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
|
|
uint16x8_t vecColorR, vecColorB, vecColorG;
|
|
uint16x8_t scratch[4];
|
|
|
|
/* unpack color */
|
|
uint16x8_t vecIn = vdupq_n_u16(Colour);
|
|
vecColorR = vecIn & vecMaskR;
|
|
vecColorB = vecIn >> 11;
|
|
vecColorG = vecIn >> 5;
|
|
vecColorG = vecColorG & vecMaskG;
|
|
vst1q((uint16_t*)scratch, vecColorR * ratio1x8);
|
|
vst1q((uint16_t*)&scratch[1], vecColorB * ratio1x8);
|
|
vst1q((uint16_t*)&scratch[2], vecColorG * ratio1x4);
|
|
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0x00fc));
|
|
|
|
int32_t row = ptCopySize->iHeight;
|
|
do {
|
|
|
|
uint16_t *phwTarget = pTargetBase;
|
|
register unsigned loopCnt __asm("lr");
|
|
loopCnt = ptCopySize->iWidth;
|
|
|
|
__asm volatile(
|
|
" vldrh.u16 q4, [%[phwTarget]] \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
// B target extraction
|
|
" vand q7, q4, %[vecMaskR] \n"
|
|
" vldrh.u16 q6, [%[scratch]] \n"
|
|
" vshr.u16 q2, q4, #5 \n"
|
|
|
|
// B mix
|
|
" vmla.u16 q6, q7, %[ratio2x8] \n"
|
|
// G extraction
|
|
" vand q7, q2, %[vecMaskG] \n"
|
|
|
|
// G extraction
|
|
" vldrh.u16 q2, [%[scratch], #32] \n"
|
|
// G mix
|
|
" vmla.u16 q2, q7, %[ratio2x4] \n"
|
|
|
|
" vshr.u16 q4, q4, #11 \n"
|
|
// R extraction
|
|
" vldrh.u16 q7, [%[scratch], #16] \n"
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
// R mix
|
|
" vmla.u16 q7, q4, %[ratio2x8] \n"
|
|
" vshr.u16 q4, q7, #8 \n"
|
|
|
|
// load duplicated 0xfc mask
|
|
" vldrh.u16 q7, [%[scratch], #48] \n"
|
|
" vand q2, q2, q7 \n"
|
|
|
|
" vmul.i16 q2, q2, %[eight] \n"
|
|
" vand q7, q4, %[vecMaskBpck] \n"
|
|
|
|
// pack R & G
|
|
" vmla.u16 q2, q7, %[twofiftysix] \n"
|
|
// downshift B ((vecB0 >> 8) >> 3)
|
|
" vshr.u16 q7, q6, #11 \n"
|
|
// schedule next target load
|
|
" vldrh.u16 q4, [%[phwTarget], #16] \n"
|
|
// pack blue with R&G
|
|
" vorr q2, q2, q7 \n"
|
|
" vstrh.16 q2, [%[phwTarget]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
: [phwTarget] "+r" (phwTarget), [loopCnt] "+r"(loopCnt)
|
|
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
|
|
[vecMaskBpck] "t" (vecMaskBpck),
|
|
[ratio2x8] "r" (ratio2x8), [ratio2x4] "r" (ratio2x4),
|
|
[eight] "r" (8), [twofiftysix] "r" (256), [scratch] "r" (scratch)
|
|
: "q2", "q4", "q5", "q6", "q7", "memory" );
|
|
|
|
pTargetBase += iTargetStride;
|
|
} while (--row);
|
|
|
|
#endif /* USE_MVE_INTRINSICS */
|
|
|
|
}
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_alpha_blending_colour_keying(
|
|
uint16_t * __RESTRICT phwSource,
|
|
int16_t iSourceStride,
|
|
uint16_t * __RESTRICT phwTarget,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint_fast8_t chRatio,
|
|
uint32_t hwColour)
|
|
{
|
|
#ifdef USE_MVE_INTRINSICS
|
|
uint32_t iHeight = ptCopySize->iHeight;
|
|
uint32_t iWidth = ptCopySize->iWidth;
|
|
|
|
int32_t blkCnt;
|
|
uint16_t ratio1x8 = chRatio * 8;
|
|
uint16_t ratio1x4 = chRatio * 4;
|
|
uint16_t ratio2x8 = (256 - chRatio) * 8;
|
|
uint16_t ratio2x4 = (256 - chRatio) * 4;
|
|
|
|
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
|
|
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
|
|
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
|
|
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
|
|
|
|
for (uint32_t y = 0; y < iHeight; y++) {
|
|
// - inconditional blending + predicated dst update
|
|
const uint16_t *pSource = phwSource;
|
|
uint16_t *pTarget = phwTarget;
|
|
blkCnt = iWidth >> 3;
|
|
|
|
while (blkCnt > 0) {
|
|
uint16x8_t vecInSrc, vecInDst;
|
|
uint16x8_t vecR0, vecB0, vecG0;
|
|
uint16x8_t vecR1, vecB1, vecG1;
|
|
|
|
/* unpack 1st stream */
|
|
vecInSrc = vld1q(pSource);
|
|
vecR0 = vandq(vecInSrc, vecMaskR);
|
|
vecB0 = vshrq(vecInSrc, 11);
|
|
vecG0 = vshrq(vecInSrc, 5);
|
|
vecG0 = vandq(vecG0, vecMaskG);
|
|
|
|
/* unpack 2nd stream */
|
|
vecInDst = vld1q(pTarget);
|
|
vecR1 = vandq(vecInDst, vecMaskR);
|
|
vecB1 = vshrq(vecInDst, 11);
|
|
vecG1 = vshrq(vecInDst, 5);
|
|
vecG1 = vandq(vecG1, vecMaskG);
|
|
|
|
/* merge */
|
|
vecR0 = vmlaq(vmulq(vecR0, ratio1x8), vecR1, ratio2x8);
|
|
vecR0 = vshrq(vecR0, 8);
|
|
vecG0 = vmlaq(vmulq(vecG0, ratio1x4), vecG1, ratio2x4);
|
|
vecG0 = vshrq(vecG0, 8);
|
|
vecB0 = vmlaq(vmulq(vecB0, ratio1x8), vecB1, ratio2x8);
|
|
vecB0 = vshrq(vecB0, 8);
|
|
|
|
/* pack */
|
|
uint16x8_t vOut = vorrq(vshrq(vecR0, 3),
|
|
vmulq(vandq(vecG0, vecMaskGpck), 8));
|
|
|
|
vOut = vorrq(vOut, vmulq(vandq(vecB0, vecMaskBpck), 256));
|
|
|
|
vst1q_p(pTarget, vOut, vcmpneq_n_s16(vecInSrc, hwColour));
|
|
|
|
pSource += 8;
|
|
pTarget += 8;
|
|
blkCnt--;
|
|
|
|
}
|
|
|
|
blkCnt = iWidth & 7;
|
|
if (blkCnt > 0U) {
|
|
uint16x8_t vecInSrc, vecInDst;
|
|
uint16x8_t vecR0, vecB0, vecG0;
|
|
uint16x8_t vecR1, vecB1, vecG1;
|
|
|
|
/* unpack 1st stream */
|
|
vecInSrc = vld1q(pSource);
|
|
vecR0 = vandq(vecInSrc, vecMaskR);
|
|
vecB0 = vshrq(vecInSrc, 11);
|
|
vecG0 = vshrq(vecInSrc, 5);
|
|
vecG0 = vandq(vecG0, vecMaskG);
|
|
|
|
/* unpack 2nd stream */
|
|
vecInDst = vld1q(pTarget);
|
|
vecR1 = vandq(vecInDst, vecMaskR);
|
|
vecB1 = vshrq(vecInDst, 11);
|
|
vecG1 = vshrq(vecInDst, 5);
|
|
vecG1 = vandq(vecG1, vecMaskG);
|
|
|
|
/* merge */
|
|
vecR0 = vmlaq(vmulq(vecR0, ratio1x8), vecR1, ratio2x8);
|
|
vecR0 = vshrq(vecR0, 8);
|
|
vecG0 = vmlaq(vmulq(vecG0, ratio1x4), vecG1, ratio2x4);
|
|
vecG0 = vshrq(vecG0, 8);
|
|
vecB0 = vmlaq(vmulq(vecB0, ratio1x8), vecB1, ratio2x8);
|
|
vecB0 = vshrq(vecB0, 8);
|
|
|
|
/* pack */
|
|
uint16x8_t vOut = vorrq(vshrq(vecR0, 3),
|
|
vmulq(vandq(vecG0, vecMaskGpck), 8));
|
|
|
|
vOut = vorrq(vOut,
|
|
vmulq(vandq(vecB0, vecMaskBpck), 256));
|
|
|
|
vst1q_p(pTarget, vOut,
|
|
vcmpneq_m_n_s16(vecInSrc, hwColour, vctp16q(blkCnt)));
|
|
|
|
}
|
|
|
|
phwSource += iSourceStride;
|
|
phwTarget += iTargetStride;
|
|
}
|
|
#else
|
|
uint32_t iHeight = ptCopySize->iHeight;
|
|
uint32_t iWidth = ptCopySize->iWidth;
|
|
|
|
uint16_t ratio1x8 = chRatio * 8;
|
|
uint16_t ratio1x4 = chRatio * 4;
|
|
uint16_t ratio2x8 = (256 - chRatio) * 8;
|
|
uint16_t ratio2x4 = (256 - chRatio) * 4;
|
|
|
|
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
|
|
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
|
|
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
|
|
uint16x8_t scratch[1];
|
|
|
|
vst1q((uint16_t *)scratch, vdupq_n_u16(0x00fc));
|
|
|
|
for (uint32_t y = 0; y < iHeight; y++) {
|
|
|
|
const uint16_t *pSource = phwSource;
|
|
uint16_t *pTarget = phwTarget;
|
|
register unsigned loopCnt __asm("lr");
|
|
loopCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" vldrh.u16 q4, [%[pTarget]] \n"
|
|
" vldrh.u16 q5, [%[pSource]], #16 \n"
|
|
" vand q7, q4, %[vecMaskR] \n"
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
// B target extraction
|
|
" vshr.u16 q2, q4, #5 \n"
|
|
" vmul.i16 q6, q7, %[ratio2x8] \n"
|
|
// B source extraction
|
|
" vand q7, q5, %[vecMaskR] \n"
|
|
// B mix
|
|
" vmla.u16 q6, q7, %[ratio1x8] \n"
|
|
// G extraction
|
|
" vand q2, q2, %[vecMaskG] \n"
|
|
" vshr.u16 q7, q5, #5 \n"
|
|
" vmul.i16 q2, q2, %[ratio2x4] \n"
|
|
// G extraction
|
|
" vand q7, q7, %[vecMaskG] \n"
|
|
// G mix
|
|
" vmla.u16 q2, q7, %[ratio1x4] \n"
|
|
// R extraction
|
|
" vshr.u16 q4, q4, #11 \n"
|
|
" vmul.i16 q7, q4, %[ratio2x8] \n"
|
|
// R extraction
|
|
" vshr.u16 q5, q5, #11 \n"
|
|
// R mix
|
|
" vmla.u16 q7, q5, %[ratio1x8] \n"
|
|
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
" vldrh.16 q5, [%[scratch]] \n"
|
|
|
|
" vand q2, q2, q5 \n"
|
|
// vmulq((vecG0 & 0x00fc), 8)
|
|
" vmul.i16 q2, q2, %[eight] \n"
|
|
" vshr.u16 q4, q7, #8 \n"
|
|
// schedule next source load
|
|
" vldrh.u16 q5, [%[pSource]], #16 \n"
|
|
" vand q7, q4, %[vecMaskBpck] \n"
|
|
// pack R & G
|
|
// vmulq((vecG0 & vecMaskGpck), 8) + vmulq((vecR0 & vecMaskRpck), 256)
|
|
" vmla.u16 q2, q7, %[twofiftysix] \n"
|
|
// downshift B ((vecB0 >> 8) >> 3)
|
|
" vshr.u16 q7, q6, #11 \n"
|
|
// schedule next target load (pre offset as target not imcrementred so far)
|
|
" vldrh.u16 q4, [%[pTarget], #16] \n"
|
|
// pack blue with R&G
|
|
" vorr q2, q2, q7 \n"
|
|
" vldrh.u16 q6, [%[pSource], #-32] \n"
|
|
" vand q7, q4, %[vecMaskR] \n"
|
|
" vpt.u16 ne, q6, %[hwColour] \n"
|
|
" vstrht.16 q2, [%[pTarget]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pSource] "+r"(pSource), [pTarget] "+r" (pTarget), [loopCnt] "+r"(loopCnt)
|
|
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
|
|
[vecMaskBpck] "t" (vecMaskBpck),
|
|
[ratio1x8] "r" (ratio1x8), [ratio2x8] "r" (ratio2x8),
|
|
[ratio1x4] "r" (ratio1x4), [ratio2x4] "r" (ratio2x4),
|
|
[eight] "r" (8), [twofiftysix] "r" (256), [hwColour] "r" (hwColour), [scratch] "r" (scratch)
|
|
: "q2", "q4", "q5", "q6", "q7", "memory" );
|
|
|
|
phwSource += (iSourceStride);
|
|
phwTarget += (iTargetStride);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_cccn888_alpha_blending( uint32_t *pwSourceBase,
|
|
int16_t iSourceStride,
|
|
uint32_t *pwTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *ptCopySize,
|
|
uint_fast8_t chRatio)
|
|
{
|
|
#ifdef USE_MVE_INTRINSICS
|
|
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
|
|
int32_t blkCnt;
|
|
int32_t row = ptCopySize->iHeight;
|
|
|
|
while (row > 0) {
|
|
|
|
const uint32_t *pwSource = pwSourceBase;
|
|
uint32_t *pwTarget = pwTargetBase;
|
|
/* byte extraction into 16-bit vector */
|
|
uint16x8_t vecSrc = vldrbq_u16((const uint8_t *)pwSource);
|
|
uint16x8_t vecTrg = vldrbq_u16((const uint8_t *)pwTarget);
|
|
|
|
pwSource += 2;
|
|
blkCnt = ptCopySize->iWidth;
|
|
|
|
while (blkCnt > 0) {
|
|
vstrbq_u16((const uint8_t *)pwTarget,
|
|
vmlaq(vmulq(vecSrc, chRatio), vecTrg, chRatioCompl) >> 8);
|
|
|
|
pwTarget += 2;
|
|
|
|
vecSrc = vldrbq_u16((const uint8_t *)pwSource);
|
|
vecTrg = vldrbq_u16((const uint8_t *)pwTarget);
|
|
pwSource += 2;
|
|
blkCnt -= 2;
|
|
}
|
|
|
|
pwSourceBase += iSourceStride;
|
|
pwTargetBase += iTargetStride;
|
|
row--;
|
|
}
|
|
#else
|
|
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
|
|
register unsigned blkCnt __asm("lr");
|
|
int32_t row = ptCopySize->iHeight;
|
|
|
|
while(row > 0)
|
|
{
|
|
blkCnt = ptCopySize->iWidth*4;
|
|
const uint32_t *pwSource = pwSourceBase;
|
|
uint32_t *pwTarget = pwTargetBase;
|
|
|
|
__asm volatile(
|
|
" vldrb.u16 q0, [%[pwSource]], #8 \n"
|
|
" vldrb.u16 q1, [%[pwTarget]] \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" vmul.u16 q2, q0, %[chRatio] \n"
|
|
" vldrb.u16 q0, [%[pwSource]], #8 \n"
|
|
" vmla.u16 q2, q1, %[chRatioCompl] \n"
|
|
" vldrb.u16 q1, [%[pwTarget], #8] \n"
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
" vstrb.16 q2, [%[pwTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pwSource] "+l"(pwSource), [pwTarget] "+l"(pwTarget),
|
|
[loopCnt] "+r"(blkCnt)
|
|
: [chRatio] "r" (chRatio), [chRatioCompl] "r" (chRatioCompl)
|
|
: "q0", "q1", "q2", "memory" );
|
|
|
|
pwSourceBase += iSourceStride;
|
|
pwTargetBase += iTargetStride;
|
|
row--;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_cccn888_colour_filling_with_opacity(
|
|
uint32_t *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
uint32_t Colour,
|
|
uint_fast8_t chRatio)
|
|
{
|
|
#ifdef USE_MVE_INTRINSICS
|
|
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
|
|
int32_t blkCnt;
|
|
int32_t row = ptCopySize->iHeight;
|
|
uint32_t scratch[2];
|
|
uint16x8_t vColor;
|
|
|
|
scratch[0] = scratch[1] = Colour;
|
|
vColor = vldrbq_u16((uint8_t *) scratch);
|
|
vColor = vColor * (uint16_t)chRatio;
|
|
|
|
while (row > 0) {
|
|
uint32_t *pTarget = pTargetBase;
|
|
blkCnt = ptCopySize->iWidth;
|
|
|
|
while (blkCnt > 0) {
|
|
/* byte extraction into 16-bit vector */
|
|
uint16x8_t vecTrg = vldrbq_u16((uint8_t *)pTarget);
|
|
|
|
vstrbq_u16((uint8_t *)pTarget, vmlaq(vColor, vecTrg, chRatioCompl) >> 8);
|
|
|
|
pTarget += 2;
|
|
blkCnt -= 2;
|
|
}
|
|
pTargetBase += iTargetStride;
|
|
row--;
|
|
}
|
|
#else /* USE_MVE_INTRINSICS */
|
|
|
|
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
|
|
int32_t blkCnt;
|
|
int32_t row = ptCopySize->iHeight;
|
|
uint32_t scratch[2];
|
|
uint16x8_t vColor;
|
|
|
|
scratch[0] = scratch[1] = Colour;
|
|
vColor = vldrbq_u16((uint8_t *) scratch);
|
|
vColor = vColor * (uint16_t)chRatio;
|
|
|
|
while (row > 0) {
|
|
uint32_t *pTarget = pTargetBase;
|
|
blkCnt = ptCopySize->iWidth*4;
|
|
|
|
__asm volatile(
|
|
/* preload */
|
|
" vldrb.u16 q1, [%[pTarget]] \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" vmov q2, %[vColor] \n"
|
|
" vmla.u16 q2, q1, %[chRatioCompl] \n"
|
|
" vldrb.u16 q1, [%[pTarget], #8] \n"
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
" vstrb.16 q2, [%[pTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
: [pTarget] "+l"(pTarget)
|
|
: [loopCnt] "r"(blkCnt), [chRatioCompl] "r" (chRatioCompl), [vColor] "t" (vColor)
|
|
: "q0", "q1", "q2", "memory" );
|
|
|
|
pTargetBase += iTargetStride;
|
|
row--;
|
|
}
|
|
|
|
#endif /* USE_MVE_INTRINSICS */
|
|
}
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_cccn888_alpha_blending_colour_keying(uint32_t * __RESTRICT pSourceBase,
|
|
int16_t iSourceStride,
|
|
uint32_t * __RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *
|
|
__RESTRICT ptCopySize,
|
|
uint_fast8_t chRatio,
|
|
uint32_t Colour)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16_t chRatioCompl = 256 - chRatio;
|
|
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint32_t *pSource = pSourceBase;
|
|
uint32_t *pTarget = pTargetBase;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
int32_t blkCnt = iWidth;
|
|
|
|
do {
|
|
mve_pred16_t p = vctp32q(blkCnt);
|
|
|
|
uint8x16_t vSrc8 = vld1q_z(pSource, p);
|
|
uint8x16_t vTrg8 = vld1q_z(pTarget, p);
|
|
|
|
/* 16-bit expansion A/G src pixels */
|
|
uint16x8_t vSrc16b = vmovlbq_x(vSrc8, p);
|
|
/* 16-bit expansion R/B src pixels */
|
|
uint16x8_t vSrc16t = vmovltq_x(vSrc8, p);
|
|
|
|
/* 16-bit expansion A/G target pixels */
|
|
uint16x8_t vTrg16b = vmovlbq_x(vTrg8, p);
|
|
/* 16-bit expansion R/B target pixels */
|
|
uint16x8_t vTrg16t = vmovltq_x(vTrg8, p);
|
|
|
|
/* A/G blending */
|
|
int16x8_t vecOutb = vmlaq_m(vmulq_x(vSrc16b, chRatio, p), vTrg16b, chRatioCompl, p);
|
|
/* R/B blending */
|
|
int16x8_t vecOutt = vmlaq_m(vmulq_x(vSrc16t, chRatio, p), vTrg16t, chRatioCompl, p);
|
|
|
|
/* merge into 8-bit vector */
|
|
int8x16_t vecOut8 = vuninitializedq_s8();
|
|
|
|
vecOut8 = vqshrnbq_m_n_s16(vecOut8, vecOutb, 8, p);
|
|
vecOut8 = vqshrntq_m_n_s16(vecOut8, vecOutt, 8, p);
|
|
|
|
// update if (*pSourceBase != Colour)
|
|
vst1q_p_u32(pTarget, (uint32x4_t) vecOut8,
|
|
vcmpneq_m_n_u32((uint32x4_t) vSrc8, Colour, p));
|
|
|
|
pSource += 4;
|
|
pTarget += 4;
|
|
blkCnt -= 4;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
#else // USE_MVE_INTRINSICS
|
|
|
|
__asm volatile (
|
|
".p2align 2 \n"
|
|
/* preload uint32x4_t target vector */
|
|
" vldrw.u32 q2, [%[targ]] \n"
|
|
|
|
" wlstp.32 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
/* 16-bit expansion A/G target pixels */
|
|
" vmovlb.u8 q3, q2 \n"
|
|
" vldrw.u32 q0, [%[src]], #16 \n"
|
|
/* 16-bit expansion A/G source pixels */
|
|
" vmovlb.u8 q1, q0 \n"
|
|
" vmul.i16 q1, q1, %[ratio] \n"
|
|
/* 16-bit expansion R/B target pixels */
|
|
" vmovlt.u8 q2, q2 \n"
|
|
/* A/G blending */
|
|
" vmla.u16 q1, q3, %[ratioCmp] \n"
|
|
/* 16-bit expansion R/B source pixels */
|
|
" vmovlt.u8 q3, q0 \n"
|
|
" vmul.i16 q3, q3, %[ratio] \n"
|
|
/* merge A/G into 8-bit vector */
|
|
" vqshrnb.s16 q1, q1, #8 \n"
|
|
/* R/B blending */
|
|
" vmla.u16 q3, q2, %[ratioCmp] \n"
|
|
/* preload next target */
|
|
" vldrw.u32 q2, [%[targ], #16] \n"
|
|
/* merge R/B into 8-bit vector */
|
|
" vqshrnt.s16 q1, q3, #8 \n"
|
|
/* update if (*pSourceBase != Colour) */
|
|
" vpt.i32 ne, q0, %[color] \n"
|
|
" vstrwt.32 q1, [%[targ]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
:[targ] "+r" (pTarget), [src] "+r" (pSource)
|
|
:[loopCnt] "r" (iWidth), [ratio] "r" (chRatio),
|
|
[ratioCmp] "r" (chRatioCompl), [color] "r" (Colour)
|
|
:"r14", "q0", "q1", "q2", "q3", "memory");
|
|
#endif
|
|
pSourceBase += (iSourceStride);
|
|
pTargetBase += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_alpha_blending_direct(const uint16_t *phwSource,
|
|
const uint16_t *phwBackground,
|
|
uint16_t *phwDestination,
|
|
uint32_t wPixelCount,
|
|
uint_fast8_t chRatio)
|
|
{
|
|
#ifdef USE_MVE_INTRINSICS
|
|
int32_t blkCnt;
|
|
uint16_t ratio1x8 = chRatio * 8;
|
|
uint16_t ratio1x4 = chRatio * 4;
|
|
uint16_t ratio2x8 = (256 - chRatio) * 8;
|
|
uint16_t ratio2x4 = (256 - chRatio) * 4;
|
|
|
|
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
|
|
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
|
|
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
|
|
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
|
|
|
|
blkCnt = wPixelCount;
|
|
do {
|
|
|
|
uint16x8_t vecIn;
|
|
uint16x8_t vecR0, vecB0, vecG0;
|
|
uint16x8_t vecR1, vecB1, vecG1;
|
|
|
|
/* unpack 1st stream */
|
|
vecIn = vld1q(phwSource);
|
|
phwSource += 8;
|
|
|
|
vecR0 = vecIn & vecMaskR;
|
|
|
|
vecB0 = vecIn >> 11;
|
|
|
|
vecG0 = vecIn >> 5;
|
|
vecG0 = vecG0 & vecMaskG;
|
|
|
|
|
|
/* unpack 2nd stream */
|
|
vecIn = vld1q(phwBackground);
|
|
phwBackground += 8;
|
|
|
|
vecR1 = vecIn & vecMaskR;
|
|
|
|
vecB1 = vecIn >> 11;
|
|
|
|
vecG1 = vecIn >> 5;
|
|
vecG1 = vecG1 & vecMaskG;
|
|
|
|
|
|
/* merge */
|
|
vecR0 = vecR0 * ratio1x8 + vecR1 * ratio2x8;
|
|
vecR0 = vecR0 >> 8;
|
|
|
|
vecG0 = vecG0 * ratio1x4 + vecG1 * ratio2x4;
|
|
vecG0 = vecG0 >> 8;
|
|
|
|
vecB0 = vecB0 * ratio1x8 + vecB1 * ratio2x8;
|
|
vecB0 = vecB0 >> 8;
|
|
|
|
|
|
/* pack */
|
|
uint16x8_t vOut =
|
|
vecR0 >> 3 | vmulq((vecG0 & vecMaskGpck), 8)
|
|
| vmulq((vecB0 & vecMaskBpck), 256);
|
|
|
|
vst1q(phwDestination, vOut);
|
|
phwDestination += 8;
|
|
|
|
blkCnt -= 8;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
#else /* USE_MVE_INTRINSICS */
|
|
|
|
uint16_t ratio1x8 = chRatio * 8;
|
|
uint16_t ratio1x4 = chRatio * 4;
|
|
uint16_t ratio2x8 = (256 - chRatio) * 8;
|
|
uint16_t ratio2x4 = (256 - chRatio) * 4;
|
|
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
|
|
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
|
|
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
|
|
|
|
register unsigned loopCnt __asm("lr") = (wPixelCount);
|
|
|
|
__asm volatile(
|
|
|
|
" vldrh.u16 q4, [%[in2]], #16 \n"
|
|
" vmov.i16 q6, #0x00fc \n"
|
|
" vstrw.32 q6, [sp] \n"
|
|
" vldrh.u16 q5, [%[in1]], #16 \n"
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
|
|
" vand q6, q4, %[vecMaskR] \n"
|
|
" vmul.i16 q6, q6, %[ratio2x8] \n"
|
|
" vshr.u16 q2, q4, #5 \n"
|
|
" vand q7, q5, %[vecMaskR] \n"
|
|
" vmla.u16 q6, q7, %[ratio1x8] \n"
|
|
" vand q2, q2, %[vecMaskG] \n"
|
|
" vshr.u16 q7, q5, #5 \n"
|
|
" vmul.i16 q2, q2, %[ratio2x4] \n"
|
|
" vand q7, q7, %[vecMaskG] \n"
|
|
" vmla.u16 q2, q7, %[ratio1x4] \n"
|
|
" vshr.u16 q4, q4, #11 \n"
|
|
" vmul.i16 q7, q4, %[ratio2x8] \n"
|
|
" vshr.u16 q5, q5, #11 \n"
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
" vmla.u16 q7, q5, %[ratio1x8] \n"
|
|
|
|
// " vmov.i16 q6, #0x00fc \n"
|
|
" vshr.u16 q7, q7, #8 \n"
|
|
// " vmov.i16 q6, #0x00fc \n"
|
|
/* load 0x00fc instead of mov for better overlap opportunity */
|
|
" vldrw.u32 q4, [sp] \n"
|
|
" vand q2, q2, q4 \n"
|
|
" vmul.i16 q2, q2, %[eight] \n"
|
|
" vand q4, q7, %[vecMaskBpck] \n" // Q7 = vecB0
|
|
" vldrh.u16 q5, [%[in1]], #16 \n"
|
|
" vmla.u16 q2, q4, %[twofiftysix] \n"
|
|
// (vecR0 >> 3) >> 8
|
|
" vshr.u16 q6, q6, #11 \n"
|
|
" vldrh.u16 q4, [%[in2]], #16 \n"
|
|
" vorr q2, q2, q6 \n"
|
|
" vstrh.16 q2, [%[out]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [in1] "+r"(phwSource), [in2] "+r"(phwBackground),
|
|
[out] "+r" (phwDestination), [loopCnt] "+r"(loopCnt)
|
|
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
|
|
[vecMaskBpck] "t" (vecMaskBpck),
|
|
[ratio1x8] "r" (ratio1x8), [ratio2x8] "r" (ratio2x8),
|
|
[ratio1x4] "r" (ratio1x4), [ratio2x4] "r" (ratio2x4),
|
|
[eight] "r" (8), [twofiftysix] "r" (256)
|
|
: "q2", "q4", "q5", "q6", "q7", "memory" );
|
|
|
|
#endif
|
|
}
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_cccn888_alpha_blending_direct(const uint32_t *pwSource,
|
|
const uint32_t *pwBackground,
|
|
uint32_t *pwDestination,
|
|
uint32_t wPixelCount,
|
|
uint_fast8_t chRatio)
|
|
{
|
|
#ifdef USE_MVE_INTRINSICS
|
|
int32_t blkCnt;
|
|
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
|
|
uint16x8_t vecSrc, vecBckg;
|
|
|
|
|
|
vecSrc = vldrbq_u16((uint8_t const *) pwSource);
|
|
pwSource += 2;
|
|
vecBckg = vldrbq_u16((uint8_t const *) pwBackground);
|
|
pwBackground += 2;
|
|
|
|
|
|
blkCnt = wPixelCount;
|
|
do {
|
|
uint16x8_t vecOut;
|
|
|
|
vecOut = vmulq_n_u16(vecSrc, (uint16_t) chRatio);
|
|
vecSrc = vldrbq_u16((uint8_t const *) pwSource);
|
|
pwSource += 2;
|
|
|
|
vecOut = vmlaq_n_u16(vecOut, vecBckg, chRatioCompl);
|
|
vecBckg = vldrbq_u16((uint8_t const *) pwBackground);
|
|
pwBackground += 2;
|
|
|
|
vecOut = vecOut >> 8;
|
|
|
|
vstrbq_u16((uint8_t *) pwDestination, vecOut);
|
|
pwDestination += 2;
|
|
|
|
blkCnt -= 2;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
#else /* USE_MVE_INTRINSICS */
|
|
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
|
|
register unsigned blkCnt __asm("lr") = (wPixelCount * 4);
|
|
|
|
__asm volatile(
|
|
" vldrb.u16 q0, [%[pwSource]], #8 \n"
|
|
" vldrb.u16 q1, [%[pwBackg]], #8 \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" vmul.u16 q2, q0, %[chRatio] \n"
|
|
" vldrb.u16 q0, [%[pwSource]], #8 \n"
|
|
" vmla.u16 q2, q1, %[chRatioCompl] \n"
|
|
" vldrb.u16 q1, [%[pwBackg]], #8 \n"
|
|
" vshr.u16 q2, q2, #8 \n"
|
|
" vstrb.16 q2, [%[pwDest]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pwSource] "+l"(pwSource), [pwBackg] "+l"(pwBackground),
|
|
[pwDest] "+l" (pwDestination), [loopCnt] "+r"(blkCnt)
|
|
: [chRatio] "r" (chRatio), [chRatioCompl] "r" (chRatioCompl)
|
|
: "q0", "q1", "q2", "memory" );
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
static
|
|
mve_pred16_t arm_2d_is_point_vec_inside_region_s16(const arm_2d_region_t * ptRegion,
|
|
const arm_2d_point_s16x8_t * ptPoint)
|
|
{
|
|
mve_pred16_t p0 = vcmpgeq(ptPoint->X, ptRegion->tLocation.iX);
|
|
p0 = vcmpgeq_m(ptPoint->Y, ptRegion->tLocation.iY, p0);
|
|
p0 = vcmpltq_m(ptPoint->X, ptRegion->tLocation.iX + ptRegion->tSize.iWidth, p0);
|
|
p0 = vcmpltq_m(ptPoint->Y, ptRegion->tLocation.iY + ptRegion->tSize.iHeight, p0);
|
|
|
|
return p0;
|
|
}
|
|
|
|
static
|
|
mve_pred16_t arm_2d_is_point_vec_inside_region_s32(const arm_2d_region_t * ptRegion,
|
|
const arm_2d_point_s32x4_t * ptPoint)
|
|
{
|
|
mve_pred16_t p0 = vcmpgeq_n_s32(ptPoint->X, ptRegion->tLocation.iX);
|
|
p0 = vcmpgeq_m_n_s32(ptPoint->Y, ptRegion->tLocation.iY, p0);
|
|
p0 = vcmpltq_m_n_s32(ptPoint->X, ptRegion->tLocation.iX + ptRegion->tSize.iWidth, p0);
|
|
p0 = vcmpltq_m_n_s32(ptPoint->Y, ptRegion->tLocation.iY + ptRegion->tSize.iHeight, p0);
|
|
|
|
return p0;
|
|
}
|
|
|
|
|
|
/**
|
|
@brief return 3 vector of 16-bit channels (8-bit widened) taken from a memory reference
|
|
@param[in] pMem pointer to packed 8-bit channel
|
|
@param[out] R vector of 16-bit widened R channel
|
|
@param[out] G vector of 16-bit widened G channel
|
|
@param[out] B vector of 16-bit widened B channel
|
|
*/
|
|
void __arm_2d_unpack_rgb888_from_mem(const uint8_t * pMem, uint16x8_t * R, uint16x8_t * G,
|
|
uint16x8_t * B)
|
|
{
|
|
uint16x8_t sg = vidupq_n_u16(0, 4);
|
|
|
|
*R = vldrbq_gather_offset_u16(pMem, sg);
|
|
*G = vldrbq_gather_offset_u16(pMem + 1, sg);
|
|
*B = vldrbq_gather_offset_u16(pMem + 2, sg);
|
|
}
|
|
|
|
/**
|
|
@brief interleave 3 x 16-bit widened vectors into 8-bit memory reference
|
|
(4th channel untouched)
|
|
@param[in] pMem pointer to packed 8-bit channel
|
|
@param[in] R vector of 16-bit widened R channel
|
|
@param[in] G vector of 16-bit widened G channel
|
|
@param[in] B vector of 16-bit widened B channel
|
|
*/
|
|
void __arm_2d_pack_rgb888_to_mem(uint8_t * pMem, uint16x8_t R, uint16x8_t G, uint16x8_t B)
|
|
{
|
|
uint16x8_t sg = vidupq_n_u16(0, 4);
|
|
|
|
vstrbq_scatter_offset_u16(pMem, sg, R);
|
|
vstrbq_scatter_offset_u16(pMem + 1, sg, G);
|
|
vstrbq_scatter_offset_u16(pMem + 2, sg, B);
|
|
//vstrbq_scatter_offset_u16(pMem + 3, sg, vdupq_n_u16(0));
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
  Gather 8 x 8-bit pixels (widened to 16-bit) addressed by a vector of 2D
  coordinates, using a rebased origin to keep the gather offsets small.

  Lanes that lie outside the region of interest, or whose pixel equals
  MaskColour, receive the corresponding lane of vTarget instead.

  Names taken implicitly from the expansion scope: ptOrigValidRegion, pOrigin,
  iOrigStride, predTail and vTarget.
  Declares vPoint, p, correctionOffset, ptOffs and pOriginCorrected in the
  expansion scope, so it can be expanded only once per block.

  pGlb accumulates (OR) both the in-region predicate and the
  in-region-and-not-masked predicate.
 */
#define __ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vecX, vecY, ptVal8, MaskColour, pGlb) \
    arm_2d_point_s16x8_t vPoint = {.X = vecX,.Y = vecY }; \
    /* lanes whose point falls inside the region */ \
    mve_pred16_t p = \
        arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint); \
    pGlb |= p; \
    /* offsets are expressed relative to row (min(Y) - 1) */ \
    int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1; \
    uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride; \
    \
    /* move the base pointer by the same amount to compensate */ \
    uint8_t *pOriginCorrected = pOrigin + (correctionOffset * iOrigStride); \
    /* gather the pixels (lanes disabled by predTail read as zero) */ \
    ptVal8 = \
        vldrbq_gather_offset_z_u16(pOriginCorrected, ptOffs, predTail); \
    \
    /* keep lanes that are inside the region AND differ from the colour mask */ \
    p = vcmpneq_m_n_u16(ptVal8, MaskColour, p); \
    pGlb |= p; \
    /* every other lane falls back to the target pixel */ \
    ptVal8 = vpselq_u16(ptVal8, vTarget, p);
|
|
|
|
/**
  Gather 8 x 16-bit pixels addressed by a vector of 2D coordinates and unpack
  them into 3 channel vectors R, G and B.

  Lanes that lie outside the region of interest, or whose pixel equals
  MaskColour, receive the corresponding lane of vTarget instead.

  Names taken implicitly from the expansion scope: ptOrigValidRegion, pOrigin,
  iOrigStride, predTail and vTarget.
  Declares vPoint, p, ptOffs and ptVal in the expansion scope, so it can be
  expanded only once per block.

  pGlb accumulates (OR) both the in-region predicate and the
  in-region-and-not-masked predicate.
 */
#define __ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vecX, vecY, R, G, B, MaskColour, pGlb) \
    arm_2d_point_s16x8_t vPoint = {.X = vecX,.Y = vecY }; \
    /* lanes whose point falls inside the region */ \
    mve_pred16_t p = \
        arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint); \
    pGlb |= p; \
    \
    /* pixel index of every point: X + Y * stride */ \
    uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride; \
    /* gather the pixels (lanes disabled by predTail read as zero) */ \
    uint16x8_t ptVal = \
        vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail); \
    \
    /* keep lanes that are inside the region AND differ from the colour mask */ \
    p = vcmpneq_m_n_u16(ptVal, MaskColour, p); \
    pGlb |= p; \
    /* every other lane falls back to the target pixel */ \
    ptVal = vpselq_u16(ptVal, vTarget, p); \
    \
    /* expand channels */ \
    __arm_2d_rgb565_unpack_single_vec(ptVal, &R, &G, &B);
|
|
|
|
|
|
/**
  Same as __ARM_2D_RGB565_GET_RGBVEC_FROM_POINT, but the gather offsets are
  rebased on row (min(Y) - 1) and the origin pointer is moved accordingly, so
  that the offsets stay small.

  Gather 8 x 16-bit pixels addressed by a vector of 2D coordinates and unpack
  them into 3 channel vectors R, G and B. Lanes that lie outside the region of
  interest, or whose pixel equals MaskColour, receive the corresponding lane
  of vTarget instead.

  Names taken implicitly from the expansion scope: ptOrigValidRegion, pOrigin,
  iOrigStride, predTail and vTarget.
  Declares vPoint, p, correctionOffset, ptOffs, pOriginCorrected and ptVal in
  the expansion scope, so it can be expanded only once per block.

  pGlb accumulates (OR) both the in-region predicate and the
  in-region-and-not-masked predicate.
 */
#define __ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vecX, vecY, R, G, B, MaskColour, pGlb) \
    arm_2d_point_s16x8_t vPoint = {.X = vecX,.Y = vecY }; \
    /* lanes whose point falls inside the region */ \
    mve_pred16_t p = \
        arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint); \
    pGlb |= p; \
    /* offsets are expressed relative to row (min(Y) - 1) */ \
    int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1; \
    uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride; \
    \
    /* move the base pointer by the same amount to compensate */ \
    uint16_t *pOriginCorrected = pOrigin + (correctionOffset * iOrigStride); \
    /* gather the pixels (lanes disabled by predTail read as zero) */ \
    uint16x8_t ptVal = \
        vldrhq_gather_shifted_offset_z_u16(pOriginCorrected, ptOffs, predTail); \
    \
    /* keep lanes that are inside the region AND differ from the colour mask */ \
    p = vcmpneq_m_n_u16(ptVal, MaskColour, p); \
    pGlb |= p; \
    /* every other lane falls back to the target pixel */ \
    ptVal = vpselq_u16(ptVal, vTarget, p); \
    \
    /* expand channels */ \
    __arm_2d_rgb565_unpack_single_vec(ptVal, &R, &G, &B);
|
|
|
|
|
|
/**
  Gather 8 x 32-bit pixels addressed by a vector of 16-bit 2D coordinates and
  unpack them into 3 vectors of 16-bit widened channels R, G and B.

  The 8 points are processed as 2 halves of 4, since the 32-bit gather handles
  4 lanes at a time. In each half, lanes that lie outside the region of
  interest, or whose pixel equals MaskColour, receive the corresponding lane
  of vTargetLo / vTargetHi instead.

  Names taken implicitly from the expansion scope: ptOrigValidRegion, pOrigin,
  iOrigStride, pscratch16, scratch32, predTailLow, predTailHigh, vTargetLo and
  vTargetHi.
  Declares vPoint, tPointLo, tPointHi, p, ptOffs and ptVal in the expansion
  scope, so it can be expanded only once per block.

  pGlbLo / pGlbHi accumulate (OR) the in-region-and-not-masked predicate of
  the 1st / 2nd half.
 */
#define __ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vecX, vecY, R, G, B, MaskColour, pGlbLo, pGlbHi) \
    arm_2d_point_s16x8_t vPoint = {.X = vecX,.Y = vecY }; \
    arm_2d_point_s32x4_t tPointLo, tPointHi; \
    \
    /* widen the 8 x 16-bit coordinates into 2 x (4 x 32-bit) through memory */ \
    vst1q(pscratch16, vPoint.X); \
    tPointLo.X = vldrhq_s32(pscratch16); \
    tPointHi.X = vldrhq_s32(pscratch16 + 4); \
    \
    vst1q(pscratch16, vPoint.Y); \
    tPointLo.Y = vldrhq_s32(pscratch16); \
    tPointHi.Y = vldrhq_s32(pscratch16 + 4); \
    \
    /* ---- 1st half ---- */ \
    \
    /* lanes whose point falls inside the region */ \
    mve_pred16_t p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo); \
    /* pixel index of every point: X + Y * stride */ \
    uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride; \
    \
    /* gather the pixels (lanes disabled by predTailLow read as zero) */ \
    uint32x4_t ptVal = vldrwq_gather_shifted_offset_z_u32(pOrigin, ptOffs, predTailLow); \
    \
    /* keep lanes that are inside the region AND differ from the colour mask */ \
    p = vcmpneq_m_n_u32(ptVal, MaskColour, p); \
    pGlbLo |= p; \
    /* every other lane falls back to the target pixel */ \
    ptVal = vpselq_u32(ptVal, vTargetLo, p); \
    \
    vst1q(scratch32, ptVal); \
    \
    /* ---- 2nd half ---- */ \
    \
    /* lanes whose point falls inside the region */ \
    p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi); \
    /* pixel index of every point: X + Y * stride */ \
    ptOffs = tPointHi.X + tPointHi.Y * iOrigStride; \
    \
    /* gather the pixels (lanes disabled by predTailHigh read as zero) */ \
    ptVal = vldrwq_gather_shifted_offset_z_u32(pOrigin, ptOffs, predTailHigh); \
    \
    /* keep lanes that are inside the region AND differ from the colour mask */ \
    p = vcmpneq_m_n_u32(ptVal, MaskColour, p); \
    pGlbHi |= p; \
    /* every other lane falls back to the target pixel */ \
    ptVal = vpselq_u32(ptVal, vTargetHi, p); \
    \
    vst1q(scratch32 + 4, ptVal); \
    \
    /* de-interleave the 8 pixels parked in scratch32 into channel vectors */ \
    __arm_2d_unpack_rgb888_from_mem((uint8_t *) scratch32, &R, &G, &B);
|
|
|
|
|
|
/**
  Alpha blending of a packed RGB565 vector with 3 vectors of single R, G & B
  channels.

  Per channel: vAvgX = (vAvgX * (256 - chOpacity) + targetX * chOpacity) >> 8,
  where targetX is unpacked from vPackedTarget. The three blended channels are
  then re-packed into vBlended. vAvgR, vAvgG and vAvgB are overwritten.

  Declares vTargetR, vTargetG, vTargetB and chOpacityCompl in the expansion
  scope, so it can be expanded only once per block.
 */
#define __ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vPackedTarget, vAvgR, vAvgG, vAvgB, vBlended) \
    uint16x8_t vTargetR, vTargetG, vTargetB; \
    \
    /* unpack the vector handed in as argument (not a name captured from the caller) */ \
    __arm_2d_rgb565_unpack_single_vec(vPackedTarget, &vTargetR, &vTargetG, &vTargetB); \
    \
    uint16_t chOpacityCompl = (256 - (chOpacity)); \
    \
    /* merge */ \
    vAvgR = vAvgR * chOpacityCompl + vTargetR * (chOpacity); \
    vAvgR = vAvgR >> 8; \
    \
    vAvgG = vAvgG * chOpacityCompl + vTargetG * (chOpacity); \
    vAvgG = vAvgG >> 8; \
    \
    vAvgB = vAvgB * chOpacityCompl + vTargetB * (chOpacity); \
    vAvgB = vAvgB >> 8; \
    \
    vBlended = __arm_2d_rgb565_pack_single_vec(vAvgR, vAvgG, vAvgB);
|
|
|
|
|
|
#if __ARM_2D_HAS_HELIUM_FLOAT__ \
|
|
&& !__ARM_2D_CFG_FORCED_FIXED_POINT_ROTATION__
|
|
|
|
/* Half-precision (f16) constant. */
/* NOTE(review): not referenced in this part of the file; presumably a small */
/* calibration offset for the floating-point rotation path - confirm at its  */
/* use sites.                                                                */
#define __CALIB 0.009f16
|
|
|
|
/**
  Scale a Gray8 channel: vAvgPixel = vScal * (float16) ptVal8.
  ptVal8 is converted lane by lane with vcvtq_f16_s16; vScal is parenthesised
  so that an expression argument keeps its own precedence.
 */
#define __ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vScal) \
    vAvgPixel = (vScal) * vcvtq_f16_s16(ptVal8);
|
|
|
|
/**
  Scale a Gray8 channel with accumulation: vAvgPixel += vScal * (float16) ptVal8.
  ptVal8 is converted lane by lane with vcvtq_f16_s16; vScal is parenthesised
  so that an expression argument keeps its own precedence.
 */
#define __ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vScal) \
    vAvgPixel += (vScal) * vcvtq_f16_s16(ptVal8);
|
|
|
|
/**
  Scale R, G & B channels: vAvgPixelX = vScal * (float16) X for X in R, G, B.
  Channels are converted lane by lane with vcvtq_f16_s16; vScal is
  parenthesised so that an expression argument keeps its own precedence.
 */
#define __ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vScal) \
    vAvgPixelR = (vScal) * vcvtq_f16_s16(R); \
    vAvgPixelG = (vScal) * vcvtq_f16_s16(G); \
    vAvgPixelB = (vScal) * vcvtq_f16_s16(B);
|
|
|
|
/**
  Scale R, G & B channels with accumulation:
  vAvgPixelX += vScal * (float16) X for X in R, G, B.
  Channels are converted lane by lane with vcvtq_f16_s16; vScal is
  parenthesised so that an expression argument keeps its own precedence.
 */
#define __ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vScal) \
    vAvgPixelR += (vScal) * vcvtq_f16_s16(R); \
    vAvgPixelG += (vScal) * vcvtq_f16_s16(G); \
    vAvgPixelB += (vScal) * vcvtq_f16_s16(B);
|
|
|
|
|
|
static
|
|
bool __arm_2d_rotate_regression(arm_2d_size_t * __RESTRICT ptCopySize,
|
|
arm_2d_location_t * pSrcPoint,
|
|
float fAngle,
|
|
arm_2d_location_t * tOffset,
|
|
arm_2d_location_t * center,
|
|
int32_t iOrigStride,
|
|
arm_2d_rot_linear_regr_t regrCoefs[]
|
|
)
|
|
{
|
|
int32_t iHeight = ptCopySize->iHeight;
|
|
int32_t iWidth = ptCopySize->iWidth;
|
|
float invHeight = iHeight > 1 ? 1.0f / (float) (iHeight - 1) : __LARGEINVF32;
|
|
arm_2d_point_s32x4_t vPointCornerI;
|
|
int32x4_t vCornerX = { 0, 1, 0, 1 };
|
|
int32x4_t vCornerY = { 0, 0, 1, 1 };
|
|
float cosAngle = arm_cos_f32(fAngle);
|
|
float sinAngle = arm_sin_f32(fAngle);
|
|
arm_2d_point_float_t centerf;
|
|
float slopeX, slopeY;
|
|
bool gatherLoadIdxOverflow = 0;
|
|
|
|
//printf("invHeight %f\n",invHeight);
|
|
|
|
|
|
centerf.fX = (float) center->iX;
|
|
centerf.fY = (float) center->iY;
|
|
|
|
vPointCornerI.X = vdupq_n_s32(pSrcPoint->iX + tOffset->iX);
|
|
vPointCornerI.X = vPointCornerI.X + vmulq_n_s32(vCornerX, (iWidth - 1));
|
|
|
|
vPointCornerI.Y = vdupq_n_s32(pSrcPoint->iY + tOffset->iY);
|
|
vPointCornerI.Y = vPointCornerI.Y + vmulq_n_s32(vCornerY, (iHeight - 1));
|
|
|
|
/*
|
|
Vector version of:
|
|
|
|
int16_t iX = ptLocation->iX - ptCenter->iX;
|
|
int16_t iY = ptLocation->iY - ptCenter->iY;
|
|
|
|
float cosAngle = arm_cos_f32(fAngle);
|
|
float sinAngle = arm_sin_f32(fAngle);
|
|
|
|
ptOutBuffer->fY = (iY * cosAngle + iX * sinAngle + ptCenter->iY);
|
|
ptOutBuffer->fX = (-iY * sinAngle + iX * cosAngle + ptCenter->iX);
|
|
*/
|
|
|
|
arm_2d_point_f32x4_t vTmp, vPointCornerF;
|
|
|
|
vTmp.X = vsubq_n_f32(vcvtq_f32_s32(vPointCornerI.X), centerf.fX);
|
|
vTmp.Y = vsubq_n_f32(vcvtq_f32_s32(vPointCornerI.Y), centerf.fY);
|
|
|
|
vPointCornerF.X = vmulq_n_f32(vTmp.X, cosAngle) - vmulq_n_f32(vTmp.Y, sinAngle);
|
|
vPointCornerF.X = vaddq_n_f32(vPointCornerF.X, centerf.fX);
|
|
|
|
vPointCornerF.Y = vmulq_n_f32(vTmp.X, sinAngle) + vmulq_n_f32(vTmp.Y, cosAngle);
|
|
vPointCornerF.Y = vaddq_n_f32(vPointCornerF.Y, centerf.fY);
|
|
|
|
/*
|
|
Check whether rotated index offsets could exceed 16-bit limits
|
|
used in subsequent gather loads
|
|
This will occur for parts of large images (e.g. 320*200)
|
|
To avoid unconditional penalties for small/medium images,
|
|
returns a speculative overflow allowing to handle large offsets.
|
|
*/
|
|
float32_t maxY = vmaxnmvq(0.0f, vPointCornerF.Y);
|
|
|
|
if((iOrigStride * maxY) > (float)(UINT16_MAX))
|
|
gatherLoadIdxOverflow = true;
|
|
|
|
|
|
/* interpolation in Y direction for 1st elements column */
|
|
slopeX = (vPointCornerF.X[2] - vPointCornerF.X[0]) * invHeight;
|
|
slopeY = (vPointCornerF.Y[2] - vPointCornerF.Y[0]) * invHeight;
|
|
|
|
regrCoefs[0].slopeY = slopeY;
|
|
regrCoefs[0].slopeX = slopeX;
|
|
regrCoefs[0].interceptY = vPointCornerF.Y[0];
|
|
regrCoefs[0].interceptX = vPointCornerF.X[0];
|
|
|
|
|
|
/* interpolation in Y direction for the last elements column */
|
|
slopeX = (vPointCornerF.X[3] - vPointCornerF.X[1]) * invHeight;
|
|
slopeY = (vPointCornerF.Y[3] - vPointCornerF.Y[1]) * invHeight;
|
|
|
|
regrCoefs[1].slopeY = slopeY;
|
|
regrCoefs[1].slopeX = slopeX;
|
|
regrCoefs[1].interceptY = vPointCornerF.Y[1];
|
|
regrCoefs[1].interceptX = vPointCornerF.X[1];
|
|
|
|
return gatherLoadIdxOverflow;
|
|
}
|
|
|
|
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_gray8_get_pixel_colour(arm_2d_point_f16x8_t
|
|
* ptPoint,
|
|
arm_2d_region_t *
|
|
ptOrigValidRegion,
|
|
uint8_t * pOrigin,
|
|
int16_t iOrigStride,
|
|
uint8_t * pTarget,
|
|
uint8_t MaskColour, uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vldrbq_u16(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
float16x8_t vOne = vdupq_n_f16(1.0f);
|
|
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
|
|
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
|
|
|
|
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
|
|
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
float16x8_t vAreaTR = vWX * vWY;
|
|
float16x8_t vAreaTL = (vOne - vWX) * vWY;
|
|
float16x8_t vAreaBR = vWX * (vOne - vWY);
|
|
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
|
|
|
|
/* accumulated pixel vectors */
|
|
float16x8_t vAvgPixel;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t ptVal8;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vAreaBL);
|
|
}
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1),
|
|
ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTR);
|
|
}
|
|
|
|
|
|
/* pack */
|
|
uint16x8_t TempPixel = vcvtq_s16_f16(vAvgPixel);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
|
|
|
|
#else
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = vcvtq_s16_f16(ptPoint->X),
|
|
.Y = vcvtq_s16_f16(ptPoint->Y)
|
|
};
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
|
|
/* prepare vector of point offsets */
|
|
/* correctionOffset avoid 16-bit overflow */
|
|
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
|
|
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
|
|
|
|
/* base pointer update to compensate offset */
|
|
pOrigin += (correctionOffset * iOrigStride);
|
|
|
|
/* retrieve all point values */
|
|
uint16x8_t ptVal = vldrbq_gather_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* combine 2 predicates set to true if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
|
|
#endif
|
|
|
|
vstrbq_p_u16(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_gray8_get_pixel_colour_with_alpha(arm_2d_point_f16x8_t * ptPoint,
|
|
arm_2d_region_t * ptOrigValidRegion,
|
|
uint8_t * pOrigin,
|
|
int16_t iOrigStride,
|
|
uint8_t * pTarget,
|
|
uint8_t MaskColour,
|
|
uint8_t chOpacity, uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vldrbq_u16(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
|
|
float16x8_t vOne = vdupq_n_f16(1.0f);
|
|
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
|
|
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
|
|
|
|
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
|
|
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
float16x8_t vAreaTR = vWX * vWY;
|
|
float16x8_t vAreaTL = (vOne - vWX) * vWY;
|
|
float16x8_t vAreaBR = vWX * (vOne - vWY);
|
|
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
|
|
|
|
/* accumulated pixel vectors */
|
|
float16x8_t vAvgPixel;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t ptVal8;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vAreaBL);
|
|
}
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi,
|
|
ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1),
|
|
ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTR);
|
|
}
|
|
|
|
|
|
|
|
/* blending */
|
|
uint16x8_t vAvg = vcvtq_s16_f16(vAvgPixel);
|
|
|
|
uint16_t chTransparency = 256 - (chOpacity);
|
|
uint16x8_t vBlended = (vAvg * chTransparency + vTarget * chOpacity) >> 8;
|
|
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
|
|
#else
|
|
/* set vector predicate if point is inside the region */
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = vcvtq_s16_f16(ptPoint->X),
|
|
.Y = vcvtq_s16_f16(ptPoint->Y)
|
|
};
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
/* prepare vector of point offsets */
|
|
/* correctionOffset avoid 16-bit overflow */
|
|
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
|
|
|
|
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
|
|
|
|
/* retrieve all point values */
|
|
/* base pointer update to compensate offset */
|
|
pOrigin += (correctionOffset * iOrigStride);
|
|
|
|
uint16x8_t ptVal = vldrbq_gather_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* alpha blending */
|
|
uint16_t chTransparency = 256 - (chOpacity);
|
|
uint16x8_t vBlended = (ptVal * chTransparency + vTarget * chOpacity) >> 8;
|
|
|
|
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
|
|
#endif
|
|
|
|
vstrbq_p_u16(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_rgb565_get_pixel_colour(arm_2d_point_f16x8_t * ptPoint,
|
|
arm_2d_region_t * ptOrigValidRegion,
|
|
uint16_t * pOrigin,
|
|
int16_t iOrigStride,
|
|
uint16_t * pTarget,
|
|
uint16_t MaskColour, uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vld1q(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
float16x8_t vOne = vdupq_n_f16(1.0f);
|
|
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
|
|
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
|
|
|
|
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
|
|
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
|
|
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
float16x8_t vAreaTR = vWX * vWY;
|
|
float16x8_t vAreaTL = (vOne - vWX) * vWY;
|
|
float16x8_t vAreaBR = vWX * (vOne - vWY);
|
|
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
|
|
|
|
/* accumulated pixel vectors */
|
|
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
uint16x8_t R, G, B;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
|
|
/* pack */
|
|
uint16x8_t TempPixel = __arm_2d_rgb565_pack_single_vec(vcvtq_s16_f16(vAvgPixelR),
|
|
vcvtq_s16_f16(vAvgPixelG),
|
|
vcvtq_s16_f16(vAvgPixelB));
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
|
|
|
|
#else
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = vcvtq_s16_f16(ptPoint->X),
|
|
.Y = vcvtq_s16_f16(ptPoint->Y) };
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
/* prepare vector of point offsets */
|
|
uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride;
|
|
/* retrieve all point values */
|
|
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* combine 2 predicates set to true if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
|
|
#endif
|
|
|
|
/* update target pixels */
|
|
vst1q_p(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_rgb565_get_pixel_colour_offs_compensated(arm_2d_point_f16x8_t *
|
|
ptPoint,
|
|
arm_2d_region_t *
|
|
ptOrigValidRegion,
|
|
uint16_t * pOrigin,
|
|
int16_t iOrigStride,
|
|
uint16_t * pTarget,
|
|
uint16_t MaskColour,
|
|
uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vld1q(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
float16x8_t vOne = vdupq_n_f16(1.0f);
|
|
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
|
|
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
|
|
|
|
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
|
|
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
float16x8_t vAreaTR = vWX * vWY;
|
|
float16x8_t vAreaTL = (vOne - vWX) * vWY;
|
|
float16x8_t vAreaBR = vWX * (vOne - vWY);
|
|
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
|
|
|
|
/* accumulated pixel vectors */
|
|
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi,
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
|
|
/* pack */
|
|
uint16x8_t TempPixel = __arm_2d_rgb565_pack_single_vec(vcvtq_s16_f16(vAvgPixelR),
|
|
vcvtq_s16_f16(vAvgPixelG),
|
|
vcvtq_s16_f16(vAvgPixelB));
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
|
|
|
|
#else
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = vcvtq_s16_f16(ptPoint->X),
|
|
.Y = vcvtq_s16_f16(ptPoint->Y) };
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
|
|
/* prepare vector of point offsets */
|
|
/* correctionOffset avoid 16-bit overflow */
|
|
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
|
|
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
|
|
|
|
/* base pointer update to compensate offset */
|
|
pOrigin += (correctionOffset * iOrigStride);
|
|
|
|
/* retrieve all point values */
|
|
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* combine 2 predicates set to true if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
|
|
#endif
|
|
|
|
vst1q_p(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
static
|
|
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha(
|
|
arm_2d_point_f16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint16_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint16_t *pTarget,
|
|
uint16_t MaskColour,
|
|
uint8_t chOpacity,
|
|
uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vld1q(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
|
|
float16x8_t vOne = vdupq_n_f16(1.0f);
|
|
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
|
|
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
|
|
|
|
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
|
|
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
float16x8_t vAreaTR = vWX * vWY;
|
|
float16x8_t vAreaTL = (vOne - vWX) * vWY;
|
|
float16x8_t vAreaBR = vWX * (vOne - vWY);
|
|
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
|
|
|
|
/* accumulated pixel vectors */
|
|
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
|
|
/* blending */
|
|
uint16x8_t vAvgR, vAvgG, vAvgB;
|
|
|
|
vAvgR = vcvtq_s16_f16(vAvgPixelR);
|
|
vAvgG = vcvtq_s16_f16(vAvgPixelG);
|
|
vAvgB = vcvtq_s16_f16(vAvgPixelB);
|
|
|
|
uint16x8_t vBlended;
|
|
|
|
__ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vTarget, vAvgR, vAvgG, vAvgB, vBlended);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
|
|
|
|
#else
|
|
/* set vector predicate if point is inside the region */
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = vcvtq_s16_f16(ptPoint->X),
|
|
.Y = vcvtq_s16_f16(ptPoint->Y) };
|
|
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
/* prepare vector of point offsets */
|
|
uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride;
|
|
/* retrieve all point values */
|
|
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* alpha blending */
|
|
uint16x8_t vBlended =
|
|
__arm_2d_rgb565_alpha_blending_single_vec(ptVal, vTarget, chOpacity);
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
#endif
|
|
|
|
vst1q_p(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha_offs_compensated(
|
|
arm_2d_point_f16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint16_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint16_t *pTarget,
|
|
uint16_t MaskColour,
|
|
uint8_t chOpacity,
|
|
uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vld1q(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
|
|
float16x8_t vOne = vdupq_n_f16(1.0f);
|
|
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
|
|
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
|
|
|
|
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
|
|
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
float16x8_t vAreaTR = vWX * vWY;
|
|
float16x8_t vAreaTL = (vOne - vWX) * vWY;
|
|
float16x8_t vAreaBR = vWX * (vOne - vWY);
|
|
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
|
|
|
|
/* accumulated pixel vectors */
|
|
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
|
|
/* blending */
|
|
uint16x8_t vAvgR, vAvgG, vAvgB;
|
|
|
|
vAvgR = vcvtq_s16_f16(vAvgPixelR);
|
|
vAvgG = vcvtq_s16_f16(vAvgPixelG);
|
|
vAvgB = vcvtq_s16_f16(vAvgPixelB);
|
|
|
|
uint16x8_t vBlended;
|
|
|
|
__ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vTarget, vAvgR, vAvgG, vAvgB, vBlended);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
|
|
#else
|
|
/* set vector predicate if point is inside the region */
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = vcvtq_s16_f16(ptPoint->X),
|
|
.Y = vcvtq_s16_f16(ptPoint->Y) };
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
/* prepare vector of point offsets */
|
|
/* correctionOffset avoid 16-bit overflow */
|
|
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
|
|
|
|
uint16x8_t ptOffs =
|
|
vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
|
|
|
|
/* retrieve all point values */
|
|
/* base pointer update to compensate offset */
|
|
pOrigin += (correctionOffset * iOrigStride);
|
|
|
|
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
|
|
/* alpha blending */
|
|
uint16x8_t vBlended =
|
|
__arm_2d_rgb565_alpha_blending_single_vec(ptVal, vTarget, chOpacity);
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
|
|
#endif
|
|
|
|
vst1q_p(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_cccn888_get_pixel_colour( arm_2d_point_f16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint32_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint32_t *pTarget,
|
|
uint32_t MaskColour,
|
|
int16_t elts)
|
|
{
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
ARM_ALIGN(8) uint32_t scratch32[32];
|
|
int16_t *pscratch16 = (int16_t *) scratch32;
|
|
uint32x4_t vTargetLo = vld1q(pTarget);
|
|
uint32x4_t vTargetHi = vld1q(pTarget + 4);
|
|
mve_pred16_t predTailLow = vctp32q(elts);
|
|
mve_pred16_t predTailHigh = elts - 4 > 0 ? vctp32q(elts - 4) : 0;
|
|
float16x8_t vOne = vdupq_n_f16(1.0f);
|
|
|
|
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
|
|
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
|
|
|
|
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
|
|
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
float16x8_t vAreaTR = vWX * vWY;
|
|
float16x8_t vAreaTL = (vOne - vWX) * vWY;
|
|
float16x8_t vAreaBR = vWX * (vOne - vWY);
|
|
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
|
|
|
|
/* accumulated pixel vectors */
|
|
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulators */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlbLo = 0, predGlbHi = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi,
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
/* pack */
|
|
__arm_2d_pack_rgb888_to_mem((uint8_t *) scratch32, vcvtq_s16_f16(vAvgPixelR),
|
|
vcvtq_s16_f16(vAvgPixelG), vcvtq_s16_f16(vAvgPixelB));
|
|
|
|
uint32x4_t TempPixel = vld1q(scratch32);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
TempPixel = vpselq_u32(TempPixel, vTargetLo, predGlbLo);
|
|
|
|
vst1q_p(pTarget, TempPixel, predTailLow);
|
|
|
|
TempPixel = vld1q(scratch32 + 4);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
TempPixel = vpselq_u32(TempPixel, vTargetHi, predGlbHi);
|
|
|
|
vst1q_p(pTarget + 4, TempPixel, predTailHigh);
|
|
#else
|
|
|
|
arm_2d_point_s32x4_t tPointLo, tPointHi;
|
|
ARM_ALIGN(8) int16_t scratch[8];
|
|
mve_pred16_t p;
|
|
|
|
/* split 16-bit point vector into 2 x 32-bit vectors */
|
|
vst1q(scratch, vcvtq_s16_f16(ptPoint->X));
|
|
tPointLo.X = vldrhq_s32(scratch);
|
|
tPointHi.X = vldrhq_s32(scratch + 4);
|
|
|
|
vst1q(scratch, vcvtq_s16_f16(ptPoint->Y));
|
|
tPointLo.Y = vldrhq_s32(scratch);
|
|
tPointHi.Y = vldrhq_s32(scratch + 4);
|
|
|
|
/* 1st half */
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);
|
|
/* prepare vector of point offsets */
|
|
uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
|
|
uint32x4_t vPixel = vld1q(pTarget);
|
|
/* retrieve all point values */
|
|
uint32x4_t ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
|
|
|
|
/* combine 2 predicates set to true if point is in the region & values different from color mask */
|
|
vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
|
|
|
|
vst1q_p(pTarget, vPixel, vctp32q(elts));
|
|
|
|
elts -= 4;
|
|
if (elts > 0) {
|
|
|
|
/* second half */
|
|
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
|
|
ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
|
|
vPixel = vld1q(pTarget + 4);
|
|
|
|
ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
|
|
vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
|
|
vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_cccn888_get_pixel_colour_with_alpha(
|
|
arm_2d_point_f16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint32_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint32_t *pTarget,
|
|
uint32_t MaskColour,
|
|
uint8_t chOpacity,
|
|
int16_t elts)
|
|
{
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
ARM_ALIGN(8) uint32_t scratch32[32];
|
|
int16_t *pscratch16 = (int16_t *) scratch32;
|
|
uint32x4_t vTargetLo = vld1q(pTarget);
|
|
uint32x4_t vTargetHi = vld1q(pTarget + 4);
|
|
mve_pred16_t predTailLow = vctp32q(elts);
|
|
mve_pred16_t predTailHigh = elts - 4 > 0 ? vctp32q(elts - 4) : 0;
|
|
float16x8_t vOne = vdupq_n_f16(1.0f);
|
|
|
|
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
|
|
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
|
|
|
|
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
|
|
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
float16x8_t vAreaTR = vWX * vWY;
|
|
float16x8_t vAreaTL = (vOne - vWX) * vWY;
|
|
float16x8_t vAreaBR = vWX * (vOne - vWY);
|
|
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
|
|
|
|
/* accumulated pixel vectors */
|
|
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
|
|
/* predicate accumulators */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlbLo = 0, predGlbHi = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
uint16x8_t R, G, B;
|
|
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi,
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
|
|
/* alpha blending */
|
|
uint16x8_t vTargetR, vTargetG, vTargetB;
|
|
|
|
__arm_2d_unpack_rgb888_from_mem((const uint8_t *) pTarget, &vTargetR, &vTargetG, &vTargetB);
|
|
|
|
uint16_t chOpacityCompl = (256 - chOpacity);
|
|
|
|
uint16x8_t vAvgR, vAvgG, vAvgB;
|
|
|
|
vAvgR = vcvtq_s16_f16(vAvgPixelR);
|
|
vAvgG = vcvtq_s16_f16(vAvgPixelG);
|
|
vAvgB = vcvtq_s16_f16(vAvgPixelB);
|
|
|
|
/* merge */
|
|
vAvgR = vAvgR * chOpacityCompl + vTargetR * chOpacity;
|
|
vAvgR = vAvgR >> 8;
|
|
|
|
vAvgG = vAvgG * chOpacityCompl + vTargetG * chOpacity;
|
|
vAvgG = vAvgG >> 8;
|
|
|
|
vAvgB = vAvgB * chOpacityCompl + vTargetB * chOpacity;
|
|
vAvgB = vAvgB >> 8;
|
|
|
|
|
|
/* pack */
|
|
__arm_2d_pack_rgb888_to_mem((uint8_t *) scratch32, vAvgR, vAvgG, vAvgB);
|
|
|
|
uint32x4_t TempPixel = vld1q(scratch32);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
TempPixel = vpselq_u32(TempPixel, vTargetLo, predGlbLo);
|
|
|
|
vst1q_p(pTarget, TempPixel, predTailLow);
|
|
|
|
TempPixel = vld1q(scratch32 + 4);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
TempPixel = vpselq_u32(TempPixel, vTargetHi, predGlbHi);
|
|
|
|
vst1q_p(pTarget + 4, TempPixel, predTailHigh);
|
|
#else
|
|
arm_2d_point_s32x4_t tPointLo, tPointHi;
|
|
ARM_ALIGN(8) int16_t scratch[8];
|
|
ARM_ALIGN(8) uint32_t blendled[4];
|
|
mve_pred16_t p;
|
|
|
|
/* split 16-bit point vector into 2 x 32-bit vectors */
|
|
vst1q(scratch, vcvtq_s16_f16(ptPoint->X));
|
|
tPointLo.X = vldrhq_s32(scratch);
|
|
tPointHi.X = vldrhq_s32(scratch + 4);
|
|
|
|
vst1q(scratch, vcvtq_s16_f16(ptPoint->Y));
|
|
tPointLo.Y = vldrhq_s32(scratch);
|
|
tPointHi.Y = vldrhq_s32(scratch + 4);
|
|
|
|
|
|
/* 1st half */
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);
|
|
/* prepare vector of point offsets */
|
|
uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
|
|
uint32x4_t vPixel = vld1q(pTarget);
|
|
/* retrieve all point values */
|
|
uint32x4_t ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
|
|
|
|
vstrwq_u32((uint32_t *) scratch, ptVal);
|
|
|
|
/* alpha-blending (requires widened inputs) */
|
|
vstrbq_u16((uint8_t *) blendled,
|
|
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) scratch),
|
|
vldrbq_u16((uint8_t const *) pTarget), chOpacity));
|
|
|
|
vstrbq_u16((uint8_t *) blendled + 2,
|
|
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *)scratch + 4),
|
|
vldrbq_u16((uint8_t const *)pTarget + 2), chOpacity));
|
|
|
|
uint32x4_t vBlended = vld1q(blendled);
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
|
|
|
|
vst1q_p(pTarget, vPixel, vctp32q(elts));
|
|
|
|
elts -= 4;
|
|
if(elts > 0) {
|
|
/* second half */
|
|
|
|
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
|
|
ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
|
|
vPixel = vld1q(pTarget);
|
|
ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
|
|
|
|
vstrwq_u32((uint32_t *) scratch, ptVal);
|
|
|
|
/* alpha-blending (requires widened inputs) */
|
|
vstrbq_u16((uint8_t *) blendled,
|
|
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) scratch),
|
|
vldrbq_u16((uint8_t const *) pTarget), chOpacity));
|
|
vstrbq_u16((uint8_t *) blendled + 2,
|
|
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *)scratch + 4),
|
|
vldrbq_u16((uint8_t const *)pTarget + 2), chOpacity));
|
|
|
|
vBlended = vld1q(blendled);
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
|
|
|
|
vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
|
|
}
|
|
#endif
|
|
}
|
|
|
|
#else /* __ARM_2D_HAS_HELIUM_FLOAT__ && ! __ARM_2D_CFG_FORCED_FIXED_POINT_ROTATION__ */
|
|
|
|
|
|
/* extra calibration removed in fixed-point code since offset is lower than Q9.6 representation */
|
|
|
|
/* ~ 2^31 / (2 * pi): scale factor applied to an angle before it is handed to
   arm_cos_q31() / arm_sin_q31() by the rotation regression code */
#define ONE_BY_2PI_Q31      341782637.0f

/* shift x right by `shift` when shift is positive, left by -shift otherwise.
   Arguments are fully parenthesised so that expressions can be passed safely;
   note that `shift` and `x` are still evaluated more than once. */
#define ARSHIFT(x, shift)   ((shift) > 0 ? (x) >> (shift) : (x) << (-(shift)))

/* integer -> Q16 fixed-point */
#define TO_Q16(x)           ((x) << 16)

/* Q6 fixed-point -> integer part (drops the 6 fractional bits) */
#define GET_Q6INT(x)        ((x) >> 6)

/* integer -> Q6 fixed-point */
#define SET_Q6INT(x)        ((x) << 6)
|
|
|
|
/**
  Scale Gray8 channel:
  vAvgPixel receives the high half of the 16 x 16-bit product vScal * ptVal8.

  NOTE: the expansion already ends with ';' and is a plain statement; invoke it
        inside a braced scope.
 */
#define __ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vScal)                           \
        vAvgPixel = vmulhq_u16(vScal, ptVal8);

/**
  Scale Gray8 channel with accumulation:
  same product as above, added to the current content of vAvgPixel.
 */
#define __ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vScal)                       \
        vAvgPixel += vmulhq_u16(vScal, ptVal8);

/**
  Scale R, G & B channels:
  each vAvgPixel{R,G,B} receives the high half of vScal * {R,G,B}.

  NOTE: expands to three separate statements; invoke it inside a braced scope
        (never as the unbraced body of an if / for / while).
 */
#define __ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vScal)   \
        vAvgPixelR = vmulhq_u16(vScal, R);                                          \
        vAvgPixelG = vmulhq_u16(vScal, G);                                          \
        vAvgPixelB = vmulhq_u16(vScal, B);

/**
  Scale R, G & B channels with accumulation:
  same products as above, added to the current vAvgPixel{R,G,B}.

  NOTE: expands to three separate statements; invoke it inside a braced scope.
 */
#define __ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vScal) \
        vAvgPixelR += vmulhq_u16(vScal, R);                                         \
        vAvgPixelG += vmulhq_u16(vScal, G);                                         \
        vAvgPixelB += vmulhq_u16(vScal, B);
|
|
|
|
|
|
#ifdef VECTORIZED_ROTATION_REGR
|
|
/* disabled as slower than scalar */
|
|
static
|
|
bool __arm_2d_rotate_regression(arm_2d_size_t * __RESTRICT ptCopySize,
|
|
arm_2d_location_t * pSrcPoint,
|
|
float fAngle,
|
|
arm_2d_location_t * tOffset,
|
|
arm_2d_location_t * center,
|
|
int32_t iOrigStride,
|
|
arm_2d_rot_linear_regr_t regrCoefs[]
|
|
)
|
|
{
|
|
int32_t iHeight = ptCopySize->iHeight;
|
|
int32_t iWidth = ptCopySize->iWidth;
|
|
q31_t invHeightFx = 0x7fffffff / (iHeight - 1);
|
|
arm_2d_point_s32x4_t vPointCornerI;
|
|
int32_t AngleFx = (int32_t) roundf(fAngle * ONE_BY_2PI_Q31);
|
|
q31_t cosAngleFx = arm_cos_q31(AngleFx);
|
|
q31_t sinAngleFx = arm_sin_q31(AngleFx);
|
|
int32x4_t vCornerX = { 0, 1, 0, 1 };
|
|
int32x4_t vCornerY = { 0, 0, 1, 1 };
|
|
bool gatherLoadIdxOverflow = 0;
|
|
|
|
vPointCornerI.X = vdupq_n_s32(pSrcPoint->iX + tOffset->iX);
|
|
vPointCornerI.X = vPointCornerI.X + vmulq_n_s32(vCornerX, (iWidth - 1));
|
|
|
|
vPointCornerI.Y = vdupq_n_s32(pSrcPoint->iY + tOffset->iY);
|
|
vPointCornerI.Y = vPointCornerI.Y + vmulq_n_s32(vCornerY, (iHeight - 1));
|
|
|
|
/*
|
|
Vector version of:
|
|
|
|
int16_t iX = ptLocation->iX - ptCenter->iX;
|
|
int16_t iY = ptLocation->iY - ptCenter->iY;
|
|
|
|
q31_t cosAngleFx = arm_cos_q31(fAngle);
|
|
q31_t sinAngleFx = arm_sin_q31(fAngle);
|
|
tPointCornerFx[0][0].Y =
|
|
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
|
|
tPointCornerFx[0][0].X =
|
|
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
|
|
|
|
*/
|
|
|
|
arm_2d_point_s32x4_t vTmp1;
|
|
|
|
vTmp1.X = vsubq_n_s16(vPointCornerI.X, center->iX);
|
|
vTmp1.Y = vsubq_n_s16(vPointCornerI.Y, center->iY);
|
|
vTmp1.X <<= 16;
|
|
vTmp1.Y <<= 16;
|
|
|
|
|
|
vPointCornerI.X =
|
|
vqsubq(vqdmulhq_n_s32(vTmp1.X, cosAngleFx), vqdmulhq_n_s32(vTmp1.Y, sinAngleFx));
|
|
vPointCornerI.X = vqaddq_n_s32(vPointCornerI.X, (center->iX << 16));
|
|
|
|
vPointCornerI.Y = vqdmlahq(vqdmulhq_n_s32(vTmp1.X, sinAngleFx), vTmp1.Y, cosAngleFx);
|
|
vPointCornerI.Y = vqaddq_n_s32(vPointCornerI.Y, (center->iY << 16));
|
|
|
|
/*
|
|
Check whether rotated index offsets could exceed 16-bit limits
|
|
used in subsequent gather loads
|
|
This will occur for parts of large images (e.g. 320*200)
|
|
To avoid unconditional penalties for small/medium images,
|
|
returns a speculative overflow allowing to handle large offsets.
|
|
*/
|
|
int32_t maxY = vmaxvq(0.0f, vPointCornerI.Y);
|
|
|
|
if(MULTFX(TO_Q16(iOrigStride), maxY) > UINT16_MAX)
|
|
gatherLoadIdxOverflow = true;
|
|
|
|
|
|
/* regression parameters */
|
|
|
|
vTmp1.X[0] = vPointCornerI.X[0];
|
|
vTmp1.X[1] = vPointCornerI.X[1];
|
|
vTmp1.X[2] = vPointCornerI.Y[0];
|
|
vTmp1.X[3] = vPointCornerI.Y[1];
|
|
|
|
vTmp1.Y[0] = vPointCornerI.X[2];
|
|
vTmp1.Y[1] = vPointCornerI.X[3];
|
|
vTmp1.Y[2] = vPointCornerI.Y[2];
|
|
vTmp1.Y[3] = vPointCornerI.Y[3];
|
|
|
|
/* slopes */
|
|
vTmp1.X = vqdmulhq_n_s32(vTmp1.Y - vTmp1.X, invHeightFx);
|
|
|
|
regrCoefs[0].slopeY = vTmp1.X[2];
|
|
regrCoefs[0].slopeX = vTmp1.X[0];
|
|
regrCoefs[0].interceptY = vPointCornerI.Y[0];
|
|
regrCoefs[0].interceptX = vPointCornerI.X[0];
|
|
|
|
regrCoefs[1].slopeY = vTmp1.X[3];
|
|
regrCoefs[1].slopeX = vTmp1.X[1];
|
|
regrCoefs[1].interceptY = vPointCornerI.Y[1];
|
|
regrCoefs[1].interceptX = vPointCornerI.X[1];
|
|
|
|
return gatherLoadIdxOverflow;
|
|
}
|
|
|
|
#else
|
|
|
|
static
|
|
bool __arm_2d_rotate_regression(arm_2d_size_t * __RESTRICT ptCopySize,
|
|
arm_2d_location_t * pSrcPoint,
|
|
float fAngle,
|
|
arm_2d_location_t * tOffset,
|
|
arm_2d_location_t * center,
|
|
int32_t iOrigStride,
|
|
arm_2d_rot_linear_regr_t regrCoefs[]
|
|
)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
q31_t invHeightFx = 0x7fffffff / (iHeight - 1);
|
|
int32_t AngleFx = lroundf(fAngle * ONE_BY_2PI_Q31);
|
|
q31_t cosAngleFx = arm_cos_q31(AngleFx);
|
|
q31_t sinAngleFx = arm_sin_q31(AngleFx);
|
|
arm_2d_point_fx_t tPointCornerFx[2][2];
|
|
arm_2d_point_fx_t centerQ16;
|
|
arm_2d_point_fx_t srcPointQ16;
|
|
arm_2d_point_fx_t tOffsetQ16;
|
|
arm_2d_point_fx_t tmp;
|
|
int32_t iXQ16, iYQ16;
|
|
bool gatherLoadIdxOverflow = 0;
|
|
|
|
/* Q16 conversion */
|
|
centerQ16.X = TO_Q16(center->iX);
|
|
centerQ16.Y = TO_Q16(center->iY);
|
|
|
|
srcPointQ16.X = TO_Q16(pSrcPoint->iX);
|
|
srcPointQ16.Y = TO_Q16(pSrcPoint->iY);
|
|
|
|
tOffsetQ16.X = TO_Q16(tOffset->iX);
|
|
tOffsetQ16.Y = TO_Q16(tOffset->iY);
|
|
|
|
|
|
/* (0,0) corner */
|
|
tmp.X = srcPointQ16.X + 0 + tOffsetQ16.X;
|
|
tmp.Y = srcPointQ16.Y + 0 + tOffsetQ16.Y;
|
|
|
|
iXQ16 = tmp.X - centerQ16.X;
|
|
iYQ16 = tmp.Y - centerQ16.Y;
|
|
|
|
tPointCornerFx[0][0].Y =
|
|
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
|
|
tPointCornerFx[0][0].X =
|
|
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
|
|
|
|
|
|
/* ((iWidth - 1),0) corner */
|
|
tmp.X = srcPointQ16.X + 0 + tOffsetQ16.X + TO_Q16(iWidth - 1);
|
|
iXQ16 = tmp.X - centerQ16.X;
|
|
|
|
tPointCornerFx[1][0].Y =
|
|
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
|
|
tPointCornerFx[1][0].X =
|
|
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
|
|
|
|
|
|
/* ((iWidth - 1),(iHeight - 1)) corner */
|
|
tmp.Y = srcPointQ16.Y + tOffsetQ16.Y + TO_Q16(iHeight - 1);
|
|
iYQ16 = tmp.Y - centerQ16.Y;
|
|
|
|
tPointCornerFx[1][1].Y =
|
|
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
|
|
tPointCornerFx[1][1].X =
|
|
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
|
|
|
|
|
|
/* (0,(iHeight - 1)) corner */
|
|
tmp.X = srcPointQ16.X + 0 + tOffsetQ16.X;
|
|
iXQ16 = tmp.X - centerQ16.X;
|
|
|
|
tPointCornerFx[0][1].Y =
|
|
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
|
|
tPointCornerFx[0][1].X =
|
|
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
|
|
/*
|
|
Check whether rotated index offsets could exceed 16-bit limits
|
|
used in subsequent gather loads
|
|
This will occur for parts of large images (e.g. 320*200)
|
|
To avoid unconditional penalties for small/medium images,
|
|
returns a speculative overflow allowing to handle large offsets.
|
|
*/
|
|
int32_t maxY = MAX(MAX
|
|
(MAX(tPointCornerFx[0][0].Y, tPointCornerFx[0][1].Y),
|
|
tPointCornerFx[1][0].Y),
|
|
tPointCornerFx[1][1].Y);
|
|
|
|
if(MULTFX(TO_Q16(iOrigStride), maxY) > UINT16_MAX)
|
|
gatherLoadIdxOverflow = true;
|
|
|
|
|
|
/* regression */
|
|
int32_t slopeXFx, slopeYFx;
|
|
|
|
/* interpolation in Y direction for 1st elements column */
|
|
slopeXFx = MULTFX((tPointCornerFx[0][1].X - tPointCornerFx[0][0].X), invHeightFx);
|
|
slopeYFx = MULTFX((tPointCornerFx[0][1].Y - tPointCornerFx[0][0].Y), invHeightFx);
|
|
|
|
regrCoefs[0].slopeY = slopeYFx * 2;
|
|
regrCoefs[0].slopeX = slopeXFx * 2;
|
|
regrCoefs[0].interceptY = tPointCornerFx[0][0].Y;
|
|
regrCoefs[0].interceptX = tPointCornerFx[0][0].X;
|
|
|
|
|
|
/* interpolation in Y direction for the last elements column */
|
|
slopeXFx = MULTFX((tPointCornerFx[1][1].X - tPointCornerFx[1][0].X), invHeightFx);
|
|
slopeYFx = MULTFX((tPointCornerFx[1][1].Y - tPointCornerFx[1][0].Y), invHeightFx);
|
|
|
|
regrCoefs[1].slopeY = slopeYFx* 2;
|
|
regrCoefs[1].slopeX = slopeXFx* 2;
|
|
regrCoefs[1].interceptY = tPointCornerFx[1][0].Y;
|
|
regrCoefs[1].interceptX = tPointCornerFx[1][0].X;
|
|
|
|
return gatherLoadIdxOverflow;
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_gray8_get_pixel_colour(arm_2d_point_s16x8_t * ptPoint,
|
|
arm_2d_region_t * ptOrigValidRegion,
|
|
uint8_t * pOrigin,
|
|
int16_t iOrigStride,
|
|
uint8_t * pTarget,
|
|
uint8_t MaskColour, uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vldrbq_u16(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
|
|
int16x8_t vXi = GET_Q6INT(ptPoint->X);
|
|
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
|
|
|
|
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
|
|
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
|
|
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
|
|
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
|
|
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
|
|
|
|
/* Q16 conversion */
|
|
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
|
|
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
|
|
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
|
|
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
|
|
|
|
|
|
/* accumulated pixel vectors */
|
|
uint16x8_t vAvgPixel;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t ptVal8;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vAreaBL);
|
|
}
|
|
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, ptVal8, MaskColour,
|
|
predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), ptVal8, MaskColour,
|
|
predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTR);
|
|
}
|
|
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(vAvgPixel, vTarget, predGlb);
|
|
|
|
#else
|
|
/* extract integer part */
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = GET_Q6INT(ptPoint->X),
|
|
.Y = GET_Q6INT(ptPoint->Y)
|
|
};
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
|
|
/* prepare vector of point offsets */
|
|
/* correctionOffset avoid 16-bit overflow */
|
|
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
|
|
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
|
|
|
|
/* base pointer update to compensate offset */
|
|
pOrigin += (correctionOffset * iOrigStride);
|
|
|
|
/* retrieve all point values */
|
|
uint16x8_t ptVal = vldrbq_gather_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* combine 2 predicates set to true if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
#endif
|
|
|
|
vstrbq_p_u16(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_gray8_get_pixel_colour_with_alpha(arm_2d_point_s16x8_t * ptPoint,
|
|
arm_2d_region_t *
|
|
ptOrigValidRegion,
|
|
uint8_t * pOrigin,
|
|
int16_t iOrigStride,
|
|
uint8_t * pTarget,
|
|
uint8_t MaskColour,
|
|
uint8_t chOpacity, uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vldrbq_u16(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
|
|
int16x8_t vXi = GET_Q6INT(ptPoint->X);
|
|
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
|
|
|
|
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
|
|
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
|
|
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
|
|
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
|
|
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
|
|
|
|
/* Q16 conversion */
|
|
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
|
|
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
|
|
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
|
|
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
|
|
|
|
|
|
/* accumulated pixel vectors */
|
|
uint16x8_t vAvgPixel;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t ptVal8;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vAreaBL);
|
|
}
|
|
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, ptVal8, MaskColour,
|
|
predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), ptVal8, MaskColour,
|
|
predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
ptVal8, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTR);
|
|
}
|
|
|
|
|
|
|
|
/* alpha blending */
|
|
uint16_t chTransparency = 256 - (chOpacity);
|
|
uint16x8_t vBlended = (vAvgPixel * chTransparency + vTarget * chOpacity) >> 8;
|
|
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
|
|
|
|
#else
|
|
/* extract integer part */
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = GET_Q6INT(ptPoint->X),
|
|
.Y = GET_Q6INT(ptPoint->Y)
|
|
};
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
/* prepare vector of point offsets */
|
|
/* correctionOffset avoid 16-bit overflow */
|
|
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
|
|
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
|
|
|
|
/* retrieve all point values */
|
|
/* base pointer update to compensate offset */
|
|
pOrigin += (correctionOffset * iOrigStride);
|
|
|
|
uint16x8_t ptVal = vldrbq_gather_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* alpha blending */
|
|
uint16_t chTransparency = 256 - (chOpacity);
|
|
uint16x8_t vBlended = (ptVal * chTransparency + vTarget * chOpacity) >> 8;
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
|
|
#endif
|
|
|
|
vstrbq_p_u16(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_rgb565_get_pixel_colour( arm_2d_point_s16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint16_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint16_t *pTarget,
|
|
uint16_t MaskColour,
|
|
uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vld1q(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
|
|
int16x8_t vXi = GET_Q6INT(ptPoint->X);
|
|
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
|
|
|
|
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
|
|
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
|
|
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
|
|
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
|
|
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
|
|
|
|
/* Q16 conversion */
|
|
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
|
|
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
|
|
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
|
|
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
|
|
|
|
|
|
/* accumulated pixel vectors */
|
|
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
/* pack */
|
|
uint16x8_t TempPixel = __arm_2d_rgb565_pack_single_vec(vAvgPixelR,
|
|
vAvgPixelG,
|
|
vAvgPixelB);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
|
|
|
|
|
|
#else
|
|
/* extract integer part */
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = GET_Q6INT(ptPoint->X),
|
|
.Y = GET_Q6INT(ptPoint->Y)};
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
/* prepare vector of point offsets */
|
|
uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride;
|
|
/* retrieve all point values */
|
|
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* combine 2 predicates set to true if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
|
|
#endif
|
|
vst1q_p(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
static
|
|
void __arm_2d_impl_rgb565_get_pixel_colour_offs_compensated( arm_2d_point_s16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint16_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint16_t *pTarget,
|
|
uint16_t MaskColour,
|
|
uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vld1q(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
|
|
int16x8_t vXi = GET_Q6INT(ptPoint->X);
|
|
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
|
|
|
|
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
|
|
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
|
|
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
|
|
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
|
|
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
|
|
|
|
/* Q16 conversion */
|
|
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
|
|
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
|
|
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
|
|
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
|
|
|
|
|
|
/* accumulated pixel vectors */
|
|
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
/* pack */
|
|
uint16x8_t TempPixel = __arm_2d_rgb565_pack_single_vec(vAvgPixelR,
|
|
vAvgPixelG,
|
|
vAvgPixelB);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
|
|
|
|
|
|
#else
|
|
/* extract integer part */
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = GET_Q6INT(ptPoint->X),
|
|
.Y = GET_Q6INT(ptPoint->Y)};
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
|
|
/* prepare vector of point offsets */
|
|
/* correctionOffset avoid 16-bit overflow */
|
|
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
|
|
uint16x8_t ptOffs =
|
|
vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
|
|
|
|
/* base pointer update to compensate offset */
|
|
pOrigin += (correctionOffset * iOrigStride);
|
|
|
|
/* retrieve all point values */
|
|
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* combine 2 predicates set to true if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
#endif
|
|
|
|
vst1q_p(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha(
|
|
arm_2d_point_s16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint16_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint16_t *pTarget,
|
|
uint16_t MaskColour,
|
|
uint8_t chOpacity,
|
|
uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vld1q(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
|
|
int16x8_t vXi = GET_Q6INT(ptPoint->X);
|
|
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
|
|
|
|
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
|
|
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
|
|
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
|
|
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
|
|
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
|
|
|
|
/* Q16 conversion */
|
|
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
|
|
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
|
|
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
|
|
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
|
|
|
|
|
|
/* accumulated pixel vectors */
|
|
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
|
|
/* alpha blending */
|
|
uint16x8_t vBlended;
|
|
|
|
__ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vTarget, vAvgPixelR, vAvgPixelG, vAvgPixelB, vBlended);
|
|
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
|
|
|
|
#else
|
|
/* extract integer part */
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = GET_Q6INT(ptPoint->X),
|
|
.Y = GET_Q6INT(ptPoint->Y)};
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
/* prepare vector of point offsets */
|
|
uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride;
|
|
/* retrieve all point values */
|
|
uint16x8_t ptVal = vldrhq_gather_shifted_offset_u16(pOrigin, ptOffs);
|
|
|
|
/* alpha blending */
|
|
uint16x8_t vBlended =
|
|
__arm_2d_rgb565_alpha_blending_single_vec(ptVal, vTarget, chOpacity);
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
#endif
|
|
|
|
vst1q_p(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
|
|
static
|
|
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha_offs_compensated(
|
|
arm_2d_point_s16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint16_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint16_t *pTarget,
|
|
uint16_t MaskColour,
|
|
uint8_t chOpacity,
|
|
uint32_t elts)
|
|
{
|
|
mve_pred16_t predTail = vctp16q(elts);
|
|
uint16x8_t vTarget = vld1q(pTarget);
|
|
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
|
|
int16x8_t vXi = GET_Q6INT(ptPoint->X);
|
|
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
|
|
|
|
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
|
|
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
|
|
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
|
|
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
|
|
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
|
|
|
|
/* Q16 conversion */
|
|
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
|
|
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
|
|
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
|
|
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
|
|
|
|
|
|
/* accumulated pixel vectors */
|
|
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulator */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlb = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlb);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
/* alpha blending */
|
|
uint16x8_t vBlended;
|
|
|
|
__ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vTarget, vAvgPixelR, vAvgPixelG, vAvgPixelB, vBlended);
|
|
|
|
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
|
|
|
|
#else
|
|
/* extract integer part */
|
|
arm_2d_point_s16x8_t vPoint = {
|
|
.X = GET_Q6INT(ptPoint->X),
|
|
.Y = GET_Q6INT(ptPoint->Y)};
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
|
|
/* prepare vector of point offsets */
|
|
/* correctionOffset avoid 16-bit overflow */
|
|
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
|
|
uint16x8_t ptOffs =
|
|
vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
|
|
|
|
/* retrieve all point values */
|
|
/* base pointer update to compensate offset */
|
|
pOrigin += (correctionOffset * iOrigStride);
|
|
|
|
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
|
|
|
|
/* alpha blending */
|
|
uint16x8_t vBlended =
|
|
__arm_2d_rgb565_alpha_blending_single_vec(ptVal, vTarget, chOpacity);
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
|
|
|
|
#endif
|
|
|
|
vst1q_p(pTarget, vTarget, predTail);
|
|
}
|
|
|
|
static
|
|
void __arm_2d_impl_cccn888_get_pixel_colour( arm_2d_point_s16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint32_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint32_t *pTarget,
|
|
uint32_t MaskColour,
|
|
int16_t elts)
|
|
{
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
ARM_ALIGN(8) uint32_t scratch32[32];
|
|
int16_t *pscratch16 = (int16_t *) scratch32;
|
|
uint32x4_t vTargetLo = vld1q(pTarget);
|
|
uint32x4_t vTargetHi = vld1q(pTarget + 4);
|
|
mve_pred16_t predTailLow = vctp32q(elts);
|
|
mve_pred16_t predTailHigh = elts - 4 > 0 ? vctp32q(elts - 4) : 0;
|
|
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
|
|
|
|
int16x8_t vXi = GET_Q6INT(ptPoint->X);
|
|
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
|
|
|
|
int16x8_t vWX = ptPoint->X - (SET_Q6INT(vXi));
|
|
int16x8_t vWY = ptPoint->Y - (SET_Q6INT(vYi));
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
|
|
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
|
|
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
|
|
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
|
|
|
|
/* Q16 conversion */
|
|
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
|
|
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
|
|
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
|
|
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
|
|
|
|
|
|
/* accumulated pixel vectors */
|
|
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulators */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlbLo = 0, predGlbHi = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi,
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
|
|
/* pack */
|
|
__arm_2d_pack_rgb888_to_mem((uint8_t *) scratch32, vAvgPixelR, vAvgPixelG, vAvgPixelB);
|
|
|
|
uint32x4_t TempPixel = vld1q(scratch32);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
TempPixel = vpselq_u32(TempPixel, vTargetLo, predGlbLo);
|
|
|
|
vst1q_p(pTarget, TempPixel, predTailLow);
|
|
|
|
TempPixel = vld1q(scratch32 + 4);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
TempPixel = vpselq_u32(TempPixel, vTargetHi, predGlbHi);
|
|
|
|
vst1q_p(pTarget + 4, TempPixel, predTailHigh);
|
|
#else
|
|
|
|
arm_2d_point_s32x4_t tPointLo, tPointHi;
|
|
ARM_ALIGN(8) int16_t scratch[8];
|
|
mve_pred16_t p;
|
|
|
|
/* split 16-bit point vector into 2 x 32-bit vectors */
|
|
vst1q(scratch, GET_Q6INT(ptPoint->X));
|
|
tPointLo.X = vldrhq_s32(scratch);
|
|
tPointHi.X = vldrhq_s32(scratch + 4);
|
|
|
|
vst1q(scratch, GET_Q6INT(ptPoint->Y));
|
|
tPointLo.Y = vldrhq_s32(scratch);
|
|
tPointHi.Y = vldrhq_s32(scratch + 4);
|
|
|
|
/* 1st half */
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);
|
|
/* prepare vector of point offsets */
|
|
uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
|
|
uint32x4_t vPixel = vld1q(pTarget);
|
|
/* retrieve all point values */
|
|
uint32x4_t ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
|
|
|
|
/* combine 2 predicates set to true if point is in the region & values different from color mask */
|
|
vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
|
|
|
|
vst1q_p(pTarget, vPixel, vctp32q(elts));
|
|
|
|
elts -= 4;
|
|
if (elts > 0) {
|
|
|
|
/* second half */
|
|
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
|
|
ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
|
|
vPixel = vld1q(pTarget + 4);
|
|
|
|
ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
|
|
vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
|
|
vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
|
|
}
|
|
#endif
|
|
}
|
|
|
|
static
|
|
void __arm_2d_impl_cccn888_get_pixel_colour_with_alpha(
|
|
arm_2d_point_s16x8_t *ptPoint,
|
|
arm_2d_region_t *ptOrigValidRegion,
|
|
uint32_t *pOrigin,
|
|
int16_t iOrigStride,
|
|
uint32_t *pTarget,
|
|
uint32_t MaskColour,
|
|
uint8_t chOpacity,
|
|
int16_t elts)
|
|
{
|
|
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
|
|
ARM_ALIGN(8) uint32_t scratch32[32];
|
|
int16_t *pscratch16 = (int16_t *) scratch32;
|
|
uint32x4_t vTargetLo = vld1q(pTarget);
|
|
uint32x4_t vTargetHi = vld1q(pTarget + 4);
|
|
mve_pred16_t predTailLow = vctp32q(elts);
|
|
mve_pred16_t predTailHigh = elts - 4 > 0 ? vctp32q(elts - 4) : 0;
|
|
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
|
|
|
|
int16x8_t vXi = GET_Q6INT(ptPoint->X);
|
|
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
|
|
|
|
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
|
|
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
|
|
|
|
int16x8_t vWX = ptPoint->X - (SET_Q6INT(vXi));
|
|
int16x8_t vWY = ptPoint->Y - (SET_Q6INT(vYi));
|
|
|
|
/* combination of Bottom / Top & Left / Right areas contributions */
|
|
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
|
|
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
|
|
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
|
|
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
|
|
|
|
/* Q16 conversion */
|
|
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
|
|
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
|
|
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
|
|
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
|
|
|
|
|
|
/* accumulated pixel vectors */
|
|
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
|
|
|
|
/* predicate accumulators */
|
|
/* tracks all predications conditions for selecting final */
|
|
/* averaged pixed / target pixel */
|
|
/* can probably optimized away since averaged target pixel is */
|
|
/* equivalent to original pixel, but precision limitations would introduce small errors */
|
|
mve_pred16_t predGlbLo = 0, predGlbHi = 0;
|
|
|
|
/*
|
|
* accumulate / average over the 4 neigbouring pixels
|
|
*/
|
|
|
|
uint16x8_t R, G, B;
|
|
|
|
/* Bottom Left averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
|
|
}
|
|
|
|
/* Bottom Right averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi,
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
|
|
}
|
|
|
|
/* Top Left averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
|
|
}
|
|
|
|
/* Top Right averaging */
|
|
{
|
|
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
|
|
R, G, B, MaskColour, predGlbLo, predGlbHi);
|
|
|
|
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
|
|
}
|
|
|
|
/* alpha blending */
|
|
uint16x8_t vTargetR, vTargetG, vTargetB;
|
|
|
|
__arm_2d_unpack_rgb888_from_mem((const uint8_t *) pTarget, &vTargetR, &vTargetG, &vTargetB);
|
|
|
|
uint16_t chOpacityCompl = (256 - chOpacity);
|
|
|
|
/* merge */
|
|
vAvgPixelR = vmlaq_n_u16(vmulq_n_u16(vAvgPixelR, chOpacityCompl), vTargetR, chOpacity);
|
|
vAvgPixelR = vAvgPixelR >> 8;
|
|
|
|
vAvgPixelG = vmlaq_n_u16(vmulq_n_u16(vAvgPixelG, chOpacityCompl), vTargetG, chOpacity);
|
|
vAvgPixelG = vAvgPixelG >> 8;
|
|
|
|
vAvgPixelB = vmlaq_n_u16(vmulq_n_u16(vAvgPixelB, chOpacityCompl), vTargetB, chOpacity);
|
|
vAvgPixelB = vAvgPixelB >> 8;
|
|
|
|
/* pack */
|
|
__arm_2d_pack_rgb888_to_mem((uint8_t *) scratch32, vAvgPixelR, vAvgPixelG, vAvgPixelB);
|
|
|
|
uint32x4_t TempPixel = vld1q(scratch32);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
TempPixel = vpselq_u32(TempPixel, vTargetLo, predGlbLo);
|
|
|
|
vst1q_p(pTarget, TempPixel, predTailLow);
|
|
|
|
TempPixel = vld1q(scratch32 + 4);
|
|
|
|
/* select between target pixel, averaged pixed */
|
|
TempPixel = vpselq_u32(TempPixel, vTargetHi, predGlbHi);
|
|
|
|
vst1q_p(pTarget + 4, TempPixel, predTailHigh);
|
|
#else
|
|
arm_2d_point_s32x4_t tPointLo, tPointHi;
|
|
ARM_ALIGN(8) int16_t scratch[8];
|
|
ARM_ALIGN(8) uint32_t blendled[4];
|
|
mve_pred16_t p;
|
|
|
|
/* split 16-bit point vector into 2 x 32-bit vectors */
|
|
vst1q(scratch, GET_Q6INT(ptPoint->X));
|
|
tPointLo.X = vldrhq_s32(scratch);
|
|
tPointHi.X = vldrhq_s32(scratch + 4);
|
|
|
|
vst1q(scratch, GET_Q6INT(ptPoint->Y));
|
|
tPointLo.Y = vldrhq_s32(scratch);
|
|
tPointHi.Y = vldrhq_s32(scratch + 4);
|
|
|
|
/* 1st half */
|
|
|
|
/* set vector predicate if point is inside the region */
|
|
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);
|
|
/* prepare vector of point offsets */
|
|
uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
|
|
uint32x4_t vPixel = vld1q(pTarget);
|
|
/* retrieve all point values */
|
|
uint32x4_t ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
|
|
|
|
vstrwq_u32((uint32_t *) scratch, ptVal);
|
|
|
|
/* alpha-blending (requires widened inputs) */
|
|
vstrbq_u16((uint8_t *) blendled,
|
|
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) scratch),
|
|
vldrbq_u16((uint8_t const *) pTarget), chOpacity));
|
|
|
|
vstrbq_u16((uint8_t *) blendled + 2,
|
|
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *)scratch + 4),
|
|
vldrbq_u16((uint8_t const *)pTarget + 2), chOpacity));
|
|
|
|
uint32x4_t vBlended = vld1q(blendled);
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
|
|
|
|
vst1q_p(pTarget, vPixel, vctp32q(elts));
|
|
|
|
elts -= 4;
|
|
if(elts > 0) {
|
|
/* second half */
|
|
|
|
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
|
|
ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
|
|
vPixel = vld1q(pTarget);
|
|
ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
|
|
|
|
vstrwq_u32((uint32_t *) scratch, ptVal);
|
|
|
|
/* alpha-blending (requires widened inputs) */
|
|
vstrbq_u16((uint8_t *) blendled,
|
|
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) scratch),
|
|
vldrbq_u16((uint8_t const *) pTarget), chOpacity));
|
|
vstrbq_u16((uint8_t *) blendled + 2,
|
|
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *)scratch + 4),
|
|
vldrbq_u16((uint8_t const *)pTarget + 2), chOpacity));
|
|
|
|
vBlended = vld1q(blendled);
|
|
|
|
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
|
|
vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
|
|
|
|
vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
|
|
}
|
|
#endif
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
#define __API_INT_TYPE_BIT_NUM 8
|
|
#define __API_COLOUR gray8
|
|
#include "__arm_2d_rotate_helium.inc"
|
|
|
|
#define __API_INT_TYPE_BIT_NUM 16
|
|
#define __API_COLOUR rgb565
|
|
#include "__arm_2d_rotate_helium.inc"
|
|
|
|
#define __API_INT_TYPE_BIT_NUM 32
|
|
#define __API_COLOUR cccn888
|
|
#include "__arm_2d_rotate_helium.inc"
|
|
|
|
|
|
/* rgb8_draw_pattern helpers */
|
|
|
|
/*
 * Gather-load offsets for the 8-bit draw-pattern helper, selected according
 * to the initial offset: reading 16 entries starting at index iOffset = 3
 * yields {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2}.
 */
static uint8_t __draw_pattern_src_incr_c8bit[32] = {
    /* entries  0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* entries  8..15 */ 1, 1, 1, 1, 1, 1, 1, 1,
    /* entries 16..23 */ 2, 2, 2, 2, 2, 2, 2, 2,
    /* entries 24..31 */ 3, 3, 3, 3, 3, 3, 3, 3
};
|
|
|
|
/*
 * Single-bit masks for the 8-bit draw-pattern helper, selected according to
 * the initial offset: reading 16 entries starting at index iOffset = 3
 * yields {8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4}.
 */
static uint8_t __draw_pattern_src_bitmask_c8bit[32] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
|
|
|
|
|
|
/* rgb16_draw_pattern helpers */
|
|
|
|
/*
 * Gather-load offsets for the 16-bit draw-pattern helper, selected according
 * to the initial offset: reading 8 entries starting at index iOffset = 3
 * yields {0, 0, 0, 0, 0, 1, 1, 1}.
 */
static uint16_t __draw_pattern_src_incr_rgb16[16] = {
    /* entries 0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* entries 8..15 */ 1, 1, 1, 1, 1, 1, 1, 1
};
|
|
|
|
/*
 * Single-bit masks for the 16-bit draw-pattern helper, selected according to
 * the initial offset: reading 8 entries starting at index iOffset = 3
 * yields {8, 16, 32, 64, 128, 1, 2, 4}.
 */
static uint16_t __draw_pattern_src_bitmask_rgb16[16] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
|
|
|
|
|
|
/* rgb32_draw_pattern helpers */
|
|
|
|
/*
 * 32-bit counterpart of the draw-pattern offset tables:
 * eight 0 entries followed by eight 1 entries.
 */
static uint32_t __draw_pattern_src_incr_rgb32[16] = {
    /* entries 0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* entries 8..15 */ 1, 1, 1, 1, 1, 1, 1, 1
};
|
|
|
|
|
|
/*
 * 32-bit counterpart of the draw-pattern bitmask tables:
 * the single-bit masks 1 .. 128, repeated twice.
 */
static uint32_t __draw_pattern_src_bitmask_rgb32[16] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
|
|
|
|
|
|
|
|
/*! adding support with c code template */
|
|
#define __API_COLOUR c8bit
|
|
#define __API_ELT_SZ 8
|
|
#include "__arm_2d_draw_pattern_helium.inc"
|
|
|
|
|
|
#define __API_COLOUR rgb16
|
|
#define __API_ELT_SZ 16
|
|
#include "__arm_2d_draw_pattern_helium.inc"
|
|
|
|
|
|
#define __API_COLOUR rgb32
|
|
#define __API_ELT_SZ 32
|
|
#include "__arm_2d_draw_pattern_helium.inc"
|
|
|
|
|
|
|
|
#define __API_COLOUR c8bit
|
|
#define __API_ELT_SZ 8
|
|
#include "__arm_2d_fill_colour_helium.inc"
|
|
|
|
|
|
#define __API_COLOUR rgb16
|
|
#define __API_ELT_SZ 16
|
|
#include "__arm_2d_fill_colour_helium.inc"
|
|
|
|
|
|
#define __API_COLOUR rgb32
|
|
#define __API_ELT_SZ 32
|
|
#include "__arm_2d_fill_colour_helium.inc"
|
|
|
|
|
|
/**
  8-bit pixel colour fill with alpha / channel mask, with or without opacity:
  MVE intrinsic generator for the inner (one row) loop.

  - TRGT_LOAD     contiguous / strided load function applied to pAlpha
                  (C8BIT_TRGT_LOAD / C8BIT_TRGT_LOAD_STRIDE)
  - STRIDE        optional vector of offsets for the gather load
  - SCAL_OPACITY  extra alpha scaling function
                  (C8BIT_SCAL_OPACITY_NONE / C8BIT_SCAL_OPACITY)
  - OPACITY       optional 8-bit vector with duplicated opacity values
                  (vector format is needed to be used with VMULH.U8)
  - ALPHA_SZ      alpha channel width (1 or 4 for resp. 8 or 32-bit type)

  The macro assumes pTarget8 / pAlpha are already set up; it also picks up
  iWidth, v256 and Colour from the expansion site and declares blkCnt.

  Per lane: target = (target * (256 - alpha) + alpha * Colour) >> 8,
  processing 8 pixels per iteration under a tail predicate.
*/

#define C8BIT_COLOUR_FILLING_MASK_INNER_MVE(TRGT_LOAD, STRIDE, SCAL_OPACITY,            \
                                            OPACITY, ALPHA_SZ)                          \
    int32_t         blkCnt = iWidth;                                                    \
    do {                                                                                \
        mve_pred16_t    tailPred = vctp16q(blkCnt);                                     \
                                                                                        \
        uint16x8_t      vecTarget = vldrbq_z_u16(pTarget8, tailPred);                   \
        uint16x8_t      vecTransp = TRGT_LOAD(pAlpha, STRIDE, tailPred);                \
                                                                                        \
        vecTransp = SCAL_OPACITY(vecTransp, OPACITY, tailPred);                         \
        uint16x8_t      vecAlpha = vsubq_x_u16(v256, vecTransp, tailPred);              \
                                                                                        \
        vecTarget = vmulq_x(vecTarget, vecAlpha, tailPred);                             \
        vecTarget = vmlaq_m(vecTarget, vecTransp, (uint16_t) Colour, tailPred);         \
        vecTarget = vecTarget >> 8;                                                     \
                                                                                        \
        vstrbq_p_u16(pTarget8, vecTarget, tailPred);                                    \
                                                                                        \
        pAlpha += (8 * ALPHA_SZ);                                                       \
        pTarget8 += 8;                                                                  \
        blkCnt -= 8;                                                                    \
    }                                                                                   \
    while (blkCnt > 0);
|
|
|
|
|
|
/**
  RGB565 pixel colour fill with alpha / channel mask, with or without opacity:
  MVE intrinsic generator (complete row + column loops).

  - TRGT_LOAD     contiguous / strided load function applied to the alpha pointer
                  (RGB565_TRGT_LOAD / RGB565_TRGT_LOAD_STRIDE)
  - STRIDE        optional vector of offsets for the gather load
  - SCAL_OPACITY  extra alpha scaling function
                  (RGB565_SCAL_OPACITY_NONE / RGB565_SCAL_OPACITY)
  - OPACITY       optional 8-bit vector with duplicated opacity values
                  (vector format is needed to be used with VMULH.U8)
  - P_ALPHA       8-bit or 32-bit alpha channel pointer; it is advanced by
                  iAlphaStride after each row, so it must be a modifiable lvalue
  - ALPHA_SZ      alpha channel width (1 or 4 for resp. 8 or 32-bit type)

  The macro picks up iHeight, iWidth, pTarget, iTargetStride, iAlphaStride and
  tSrcPix (unpacked fill colour) from the expansion site and declares v256.

  It does not generate a tail-predicated loop as it relies on the pack / unpack
  functions: predication is only applied in the final stage, during the pixel
  store.
*/

#define RGB565_COLOUR_FILLING_MASK_MVE(TRGT_LOAD, STRIDE, SCAL_OPACITY, OPACITY,        \
                                       P_ALPHA, ALPHA_SZ)                               \
    uint16x8_t      v256 = vdupq_n_u16(256);                                            \
                                                                                        \
    for (int_fast16_t y = 0; y < iHeight; y++) {                                        \
        const uint8_t  *pAlpha = (const uint8_t *)P_ALPHA;                              \
        uint16_t       *pCurTarget = pTarget;                                           \
        int32_t         blkCnt = iWidth;                                                \
                                                                                        \
        do {                                                                            \
            uint16x8_t      vecTarget = vld1q(pCurTarget);                              \
            uint16x8_t      vecTransp = TRGT_LOAD(pAlpha, STRIDE);                      \
            vecTransp = SCAL_OPACITY(vecTransp, OPACITY);                               \
            uint16x8_t      vecAlpha = vsubq_u16(v256, vecTransp);                      \
            uint16x8_t      vecR, vecG, vecB;                                           \
                                                                                        \
            __arm_2d_rgb565_unpack_single_vec(vecTarget, &vecR, &vecG, &vecB);          \
                                                                                        \
            /* blending using alpha vector weights */                                   \
            vecR = vmulq(vecR, vecAlpha);                                               \
            vecR = vmlaq(vecR, vecTransp, (uint16_t) tSrcPix.R);                        \
            vecR = vecR >> 8;                                                           \
                                                                                        \
            vecG = vmulq(vecG, vecAlpha);                                               \
            vecG = vmlaq(vecG, vecTransp, (uint16_t) tSrcPix.G);                        \
            vecG = vecG >> 8;                                                           \
                                                                                        \
            vecB = vmulq(vecB, vecAlpha);                                               \
            vecB = vmlaq(vecB, vecTransp, (uint16_t) tSrcPix.B);                        \
            vecB = vecB >> 8;                                                           \
                                                                                        \
            vecTarget = __arm_2d_rgb565_pack_single_vec(vecR, vecG, vecB);              \
                                                                                        \
            /* tail predication */                                                      \
            vst1q_p_u16(pCurTarget, vecTarget, vctp16q(blkCnt));                        \
                                                                                        \
            pAlpha += (8 * ALPHA_SZ);                                                   \
            pCurTarget += 8;                                                            \
            blkCnt -= 8;                                                                \
        }                                                                               \
        while (blkCnt > 0);                                                             \
                                                                                        \
        P_ALPHA += (iAlphaStride);                                                      \
        pTarget += (iTargetStride);                                                     \
    }
|
|
|
|
|
|
/**
  CCCN888 pixel colour fill with alpha / channel mask, with or without opacity:
  MVE intrinsic generator for the inner (one row) loop.

  - TRGT_LOAD     contiguous / strided load function applied to pAlpha
                  (CCCN888_TRGT_LOAD / CCCN888_TRGT_LOAD_STRIDE)
  - STRIDE        optional vector of offsets for the gather load
  - SCAL_OPACITY  extra alpha scaling function
                  (CCCN888_SCAL_OPACITY_NONE / CCCN888_SCAL_OPACITY)
  - OPACITY       optional 8-bit vector with duplicated opacity values
                  (vector format is needed to be used with VMULH.U8)
  - ALPHA_SZ      alpha channel width (1 or 4 for resp. 8 or 32-bit type)

  The macro assumes pTargetCh0/1/2 & pAlpha are already set up; it also picks
  up iWidth, v256, vStride4Offs and the channel colours c0 / c1 / c2 from the
  expansion site and declares blkCnt.

  Each channel is gathered / scattered byte-wise through vStride4Offs and
  blended as: chan = (chan * (256 - alpha) + alpha * cN) >> 8, 8 pixels per
  iteration under a tail predicate.
*/

#define CCCN888_COLOUR_FILLING_MASK_INNER_MVE(TRGT_LOAD, STRIDE, SCAL_OPACITY,          \
                                              OPACITY, ALPHA_SZ)                        \
    int32_t         blkCnt = iWidth;                                                    \
                                                                                        \
    do {                                                                                \
        mve_pred16_t    tailPred = vctp16q(blkCnt);                                     \
                                                                                        \
        /* expand chan0, chan1, chan2 */                                                \
        uint16x8_t      vecTargetC0 = vldrbq_gather_offset_z_u16(pTargetCh0, vStride4Offs, \
                                                                 tailPred);             \
        uint16x8_t      vecTargetC1 = vldrbq_gather_offset_z_u16(pTargetCh1, vStride4Offs, \
                                                                 tailPred);             \
        uint16x8_t      vecTargetC2 = vldrbq_gather_offset_z_u16(pTargetCh2, vStride4Offs, \
                                                                 tailPred);             \
        uint16x8_t      vecTransp = TRGT_LOAD(pAlpha, STRIDE, tailPred);                \
                                                                                        \
        vecTransp = SCAL_OPACITY(vecTransp, OPACITY, tailPred);                         \
                                                                                        \
        uint16x8_t      vecAlpha = vsubq_x_u16(v256, vecTransp, tailPred);              \
                                                                                        \
        /* scale ch0 vector with alpha vector */                                        \
        vecTargetC0 = vmulq_x(vecTargetC0, vecAlpha, tailPred);                         \
        /* blend ch0 vector with input ch0 color*/                                      \
        vecTargetC0 = vmlaq_m(vecTargetC0, vecTransp, (uint16_t) c0, tailPred);         \
        vecTargetC0 = vecTargetC0 >> 8;                                                 \
                                                                                        \
        /* repeat for ch1 and ch2 */                                                    \
        vecTargetC1 = vmulq_x(vecTargetC1, vecAlpha, tailPred);                         \
        vecTargetC1 = vmlaq_m(vecTargetC1, vecTransp, (uint16_t) c1, tailPred);         \
        vecTargetC1 = vecTargetC1 >> 8;                                                 \
                                                                                        \
        vecTargetC2 = vmulq_x(vecTargetC2, vecAlpha, tailPred);                         \
        vecTargetC2 = vmlaq_m(vecTargetC2, vecTransp, (uint16_t) c2, tailPred);         \
        vecTargetC2 = vecTargetC2 >> 8;                                                 \
                                                                                        \
        /* store and merge chan0, chan1, chan2 */                                       \
        vstrbq_scatter_offset_p_u16(pTargetCh0, vStride4Offs, vecTargetC0, tailPred);   \
        vstrbq_scatter_offset_p_u16(pTargetCh1, vStride4Offs, vecTargetC1, tailPred);   \
        vstrbq_scatter_offset_p_u16(pTargetCh2, vStride4Offs, vecTargetC2, tailPred);   \
                                                                                        \
        pAlpha += 8 * ALPHA_SZ;                                                         \
        pTargetCh0 += 8*4;                                                              \
        pTargetCh1 += 8*4;                                                              \
        pTargetCh2 += 8*4;                                                              \
        blkCnt -= 8;                                                                    \
    }                                                                                   \
    while (blkCnt > 0);
|
|
|
|
|
|
|
|
/*
 * Building blocks plugged into the *_COLOUR_FILLING_MASK* generator macros.
 *
 *  *_TRGT_LOAD          contiguous widening byte load (the stride argument is ignored)
 *  *_TRGT_LOAD_STRIDE   gather byte load using `stride` as offset vector
 *  *_SCAL_OPACITY_NONE  leaves the loaded value untouched (opacity argument is ignored)
 *  *_SCAL_OPACITY       byte-wise "multiply returning high half" with the opacity vector
 *
 * Every macro expands to a single parenthesis-safe EXPRESSION (no trailing
 * semicolon), so it can be used as an initializer or inside a larger
 * expression; the caller supplies the terminating ';'.
 */
#define C8BIT_TRGT_LOAD(base, stride, pred)                 vldrbq_z_u16((base), (pred))
#define C8BIT_TRGT_LOAD_STRIDE(base, stride, pred)          vldrbq_gather_offset_z_u16((base), (stride), (pred))
#define C8BIT_SCAL_OPACITY_NONE(transp, opac, pred)         (transp)
#define C8BIT_SCAL_OPACITY(transp, opac, pred)              ((uint16x8_t) vmulhq_x((uint8x16_t) (transp), (opac), (pred)))

#define RGB565_TRGT_LOAD(base, stride)                      vldrbq_u16((base))
#define RGB565_TRGT_LOAD_STRIDE(base, stride)               vldrbq_gather_offset_u16((base), (stride))
#define RGB565_SCAL_OPACITY_NONE(transp, opac)              (transp)
#define RGB565_SCAL_OPACITY(transp, opac)                   ((uint16x8_t) vmulhq((uint8x16_t) (transp), (opac)))

#define CCCN888_TRGT_LOAD(base, stride, pred)               vldrbq_z_u16((base), (pred))
#define CCCN888_TRGT_LOAD_STRIDE(base, stride, pred)        vldrbq_gather_offset_z_u16((base), (stride), (pred))
#define CCCN888_SCAL_OPACITY_NONE(transp, opac, pred)       (transp)
#define CCCN888_SCAL_OPACITY(transp, opac, pred)            ((uint16x8_t) vmulhq_x((uint8x16_t) (transp), (opac), (pred)))
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_gray8_colour_filling_mask(uint8_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint8_t * __RESTRICT pchAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint8_t Colour)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16x8_t v256 = vdupq_n_u16(256);
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t * pAlpha = pchAlpha;
|
|
uint8_t * pTarget8 = pTarget;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD, _,
|
|
C8BIT_SCAL_OPACITY_NONE, _, 1);
|
|
#else
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" vldrb.u16 q0, [%[pTarget]] \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" vsub.i16 q2, %[vec256], q1 \n"
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" vmla.u16 q3, q1, %[Colour] \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pTarget] "+r"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
|
|
:[vec256] "t" (v256),[Colour] "r"(Colour)
|
|
:"q0", "q1", "q2", "q3", "memory");
|
|
|
|
#endif
|
|
pchAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_gray8_colour_filling_mask_opacity(uint8_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint8_t * __RESTRICT pchAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t *
|
|
__RESTRICT ptCopySize,
|
|
uint8_t Colour, uint8_t chOpacity)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
|
|
uint16x8_t v256 = vdupq_n_u16(256);
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t * pAlpha = pchAlpha;
|
|
uint8_t * pTarget8 = pTarget;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD, _,
|
|
C8BIT_SCAL_OPACITY, vOpacity, 1);
|
|
#else
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" vldrb.u16 q0, [%[pTarget]] \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
" vmulh.u8 q1, q1, %[vOpacity] \n"
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" vsub.i16 q2, %[vec256], q1 \n"
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" vmla.u16 q3, q1, %[Colour] \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
" vmulh.u8 q1, q1, %[vOpacity] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pTarget] "+r"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
|
|
:[vec256] "t" (v256),[Colour] "r"(Colour),[vOpacity] "t"(vOpacity)
|
|
:"q0", "q1", "q2", "q3", "memory");
|
|
|
|
#endif
|
|
pchAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_gray8_colour_filling_channel_mask(uint8_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint32_t * __RESTRICT pwAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint8_t Colour)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16x8_t v256 = vdupq_n_u16(256);
|
|
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
|
|
uint8_t * pTarget8 = pTarget;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD_STRIDE, vStride4Offs,
|
|
C8BIT_SCAL_OPACITY_NONE, _, 4);
|
|
#else
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" vldrb.u16 q0, [%[pTarget]] \n"
|
|
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" vsub.i16 q2, %[vec256], q1 \n"
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
" add %[pAlpha], %[pAlpha], #(8*4) \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" vmla.u16 q3, q1, %[Colour] \n"
|
|
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pTarget] "+r"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
|
|
:[vec256] "t" (v256),[Colour] "r"(Colour),[str4Offs] "t"(vStride4Offs)
|
|
:"q0", "q1", "q2", "q3", "memory");
|
|
|
|
#endif
|
|
pwAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_gray8_colour_filling_channel_mask_opacity(uint8_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint32_t * __RESTRICT pwAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t *
|
|
__RESTRICT ptCopySize,
|
|
uint8_t Colour, uint8_t chOpacity)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
|
|
uint16x8_t v256 = vdupq_n_u16(256);
|
|
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
|
|
uint8_t *pTarget8 = pTarget;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD_STRIDE, vStride4Offs,
|
|
C8BIT_SCAL_OPACITY, vOpacity, 4);
|
|
#else
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" vldrb.u16 q0, [%[pTarget]] \n"
|
|
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
|
|
" vmulh.u8 q1, q1, %[vOpacity] \n"
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" vsub.i16 q2, %[vec256], q1 \n"
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
" vldrb.u16 q0, [%[pTarget], #8] \n"
|
|
" add %[pAlpha], %[pAlpha], #(8*4) \n"
|
|
" vmla.u16 q3, q1, %[Colour] \n"
|
|
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
|
|
" vmulh.u8 q1, q1, %[vOpacity] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pTarget] "+r"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
|
|
:[vec256] "t" (v256),[Colour] "r"(Colour),[vOpacity] "t"(vOpacity),
|
|
[str4Offs] "t"(vStride4Offs)
|
|
:"q0", "q1", "q2", "q3", "memory");
|
|
|
|
#endif
|
|
pwAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_colour_filling_mask(uint16_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint8_t * __RESTRICT pchAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint16_t Colour)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
__arm_2d_color_fast_rgb_t tSrcPix;
|
|
|
|
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD, _,
|
|
RGB565_SCAL_OPACITY_NONE, _, pchAlpha, 1);
|
|
#else
|
|
/* RGB565 pack/unpack Masks */
|
|
/* use memory rather than vmov to optimize Helium operations interleaving */
|
|
uint16x8_t scratch[4];
|
|
|
|
// Unpacking Mask Red
|
|
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
|
|
// Unpacking Mask Green
|
|
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
|
|
// packing Mask Green
|
|
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
|
|
// packing Mask Blue
|
|
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t *pAlpha = pchAlpha;
|
|
uint16_t *pCurTarget = pTarget;
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile (
|
|
".p2align 2 \n"
|
|
/* load scheduling */
|
|
" vldrh.u16 q0, [%[pTarget]] \n"
|
|
" vmov.i16 q7, #0x0100 \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
|
|
// vecAlpha
|
|
" vsub.i16 q2, q7, q1 \n"
|
|
|
|
/* RGB565 unpack */
|
|
|
|
/* vecAlpha * 4 for G channel upscale */
|
|
" vmul.i16 q2, q2, %[four] \n"
|
|
/* G channel extract */
|
|
" vshr.u16 q5, q0, #5 \n"
|
|
|
|
/* load Unpacking Mask for R channel */
|
|
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
|
|
" vand q4, q0, q7 \n"
|
|
|
|
/* load Unpacking Mask for G channel */
|
|
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
|
|
" vand q5, q5, q7 \n"
|
|
/* scale G vector with alpha vector */
|
|
" vmul.u16 q5, q5, q2 \n"
|
|
|
|
/* B channel */
|
|
" vshr.u16 q6, q0, #11 \n"
|
|
|
|
/* blend G vector with input G color*/
|
|
" vmla.u16 q5, q1, %[G] \n"
|
|
|
|
/* vecAlpha * 8 for R & B upscale */
|
|
" vshl.i16 q2, q2, #1 \n"
|
|
|
|
/* scale R vector with alpha vec */
|
|
" vmul.u16 q4, q4, q2 \n"
|
|
|
|
" vshr.u16 q5, q5, #8 \n"
|
|
|
|
/* blend R vector with input R color*/
|
|
" vmla.u16 q4, q1, %[R] \n"
|
|
|
|
/* load packing Mask for G channel */
|
|
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
|
|
|
|
/* scale B vector with alpha vector */
|
|
" vmul.u16 q6, q6, q2 \n"
|
|
" vand q5, q5, q7 \n"
|
|
|
|
/* blend B vector with input B color*/
|
|
" vmla.u16 q6, q1, %[B] \n"
|
|
|
|
/* load packing Mask for B channel */
|
|
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
|
|
|
|
" vshr.u16 q6, q6, #8 \n"
|
|
|
|
/* RGB 565 pack */
|
|
|
|
/* (G & 0x00fc), 8) */
|
|
" vmul.i16 q5, q5, %[eight] \n"
|
|
|
|
/* (B & 0x00f8) */
|
|
" vand q6, q6, q7 \n"
|
|
|
|
/* load next alpha vector */
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
" vmov.i16 q7, #0x0100 \n"
|
|
|
|
/* pack G & B */
|
|
" vmla.u16 q5, q6, %[twofiftysix] \n"
|
|
/* combined (R >> 8) >> 3 */
|
|
" vshr.u16 q4, q4, #11 \n"
|
|
|
|
/* load next target */
|
|
" vldrh.u16 q0, [%[pTarget], #16] \n"
|
|
|
|
/* pack R */
|
|
" vorr q4, q4, q5 \n"
|
|
" vstrh.16 q4, [%[pTarget]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
|
|
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
|
|
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
|
|
[twofiftysix] "r" (256), [scratch] "r" (scratch)
|
|
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
|
|
|
|
pchAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_colour_filling_mask_opacity(uint16_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint8_t * __RESTRICT pchAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint16_t Colour, uint8_t chOpacity)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
|
|
|
|
__arm_2d_color_fast_rgb_t tSrcPix;
|
|
|
|
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD, _,
|
|
RGB565_SCAL_OPACITY, vOpacity, pchAlpha, 1);
|
|
#else
|
|
/* RGB565 pack/unpack Masks + opacity */
|
|
/* use memory rather than vmov to optimize Helium operations interleaving */
|
|
uint16x8_t scratch[5];
|
|
|
|
// Unpacking Mask Red
|
|
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
|
|
// Unpacking Mask Green
|
|
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
|
|
// packing Mask Green
|
|
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
|
|
// packing Mask Blue
|
|
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
|
|
// opacity
|
|
vst1q((uint16_t*)&scratch[4], (uint16x8_t)vOpacity);
|
|
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t *pAlpha = pchAlpha;
|
|
uint16_t *pCurTarget = pTarget;
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile (
|
|
".p2align 2 \n"
|
|
/* load scheduling */
|
|
" vldrh.u16 q0, [%[pTarget]] \n"
|
|
" vmov.i16 q7, #0x0100 \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
/* opacity vector */
|
|
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
|
|
" vmulh.u8 q1, q1, q6 \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
|
|
// vecAlpha
|
|
" vsub.i16 q2, q7, q1 \n"
|
|
|
|
/* RGB565 unpack */
|
|
|
|
/* vecAlpha * 4 for G channel upscale */
|
|
" vmul.i16 q2, q2, %[four] \n"
|
|
/* G channel extract */
|
|
" vshr.u16 q5, q0, #5 \n"
|
|
|
|
/* load Unpacking Mask for R channel */
|
|
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
|
|
" vand q4, q0, q7 \n"
|
|
|
|
/* load Unpacking Mask for G channel */
|
|
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
|
|
" vand q5, q5, q7 \n"
|
|
/* scale G vector with alpha vector */
|
|
" vmul.u16 q5, q5, q2 \n"
|
|
|
|
/* B channel */
|
|
" vshr.u16 q6, q0, #11 \n"
|
|
|
|
/* blend G vector with input G color*/
|
|
" vmla.u16 q5, q1, %[G] \n"
|
|
|
|
/* vecAlpha * 8 for R & B upscale */
|
|
" vshl.i16 q2, q2, #1 \n"
|
|
|
|
/* scale R vector with alpha vec */
|
|
" vmul.u16 q4, q4, q2 \n"
|
|
|
|
" vshr.u16 q5, q5, #8 \n"
|
|
|
|
/* blend R vector with input R color*/
|
|
" vmla.u16 q4, q1, %[R] \n"
|
|
|
|
/* load packing Mask for G channel */
|
|
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
|
|
|
|
/* scale B vector with alpha vector */
|
|
" vmul.u16 q6, q6, q2 \n"
|
|
" vand q5, q5, q7 \n"
|
|
|
|
/* blend B vector with input B color*/
|
|
" vmla.u16 q6, q1, %[B] \n"
|
|
|
|
/* load packing Mask for B channel */
|
|
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
|
|
|
|
" vshr.u16 q6, q6, #8 \n"
|
|
|
|
/* RGB 565 pack */
|
|
|
|
/* (G & 0x00fc), 8) */
|
|
" vmul.i16 q5, q5, %[eight] \n"
|
|
|
|
/* (B & 0x00f8) */
|
|
" vand q6, q6, q7 \n"
|
|
|
|
/* load next alpha vector */
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
" vmov.i16 q7, #0x0100 \n"
|
|
|
|
/* pack G & B */
|
|
" vmla.u16 q5, q6, %[twofiftysix] \n"
|
|
/* reload opacity and scale alpha */
|
|
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
|
|
" vmulh.u8 q1, q1, q6 \n"
|
|
|
|
/* combined (R >> 8) >> 3 */
|
|
" vshr.u16 q4, q4, #11 \n"
|
|
|
|
/* load next target */
|
|
" vldrh.u16 q0, [%[pTarget], #16] \n"
|
|
|
|
/* pack R */
|
|
" vorr q4, q4, q5 \n"
|
|
" vstrh.16 q4, [%[pTarget]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
|
|
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
|
|
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
|
|
[twofiftysix] "r" (256), [scratch] "r" (scratch)
|
|
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
|
|
|
|
pchAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_colour_filling_channel_mask(uint16_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint32_t * __RESTRICT pwAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint16_t Colour)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
|
|
__arm_2d_color_fast_rgb_t tSrcPix;
|
|
|
|
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD_STRIDE, vStride4Offs,
|
|
RGB565_SCAL_OPACITY_NONE, _, pwAlpha, 4);
|
|
#else
|
|
/* RGB565 pack/unpack Masks */
|
|
/* use memory rather than vmov to optimize Helium operations interleaving */
|
|
uint16x8_t scratch[4];
|
|
|
|
// Unpacking Mask Red
|
|
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
|
|
// Unpacking Mask Green
|
|
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
|
|
// packing Mask Green
|
|
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
|
|
// packing Mask Blue
|
|
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint32_t *pAlpha = pwAlpha;
|
|
uint16_t *pCurTarget = pTarget;
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile (
|
|
".p2align 2 \n"
|
|
/* load scheduling */
|
|
" vldrh.u16 q0, [%[pTarget]] \n"
|
|
|
|
" vmov.i16 q7, #0x0100 \n"
|
|
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" add %[pAlpha], %[pAlpha],#(8*4)\n"
|
|
// vecAlpha
|
|
" vsub.i16 q2, q7, q1 \n"
|
|
|
|
/* RGB565 unpack */
|
|
|
|
/* vecAlpha * 4 for G channel upscale */
|
|
" vmul.i16 q2, q2, %[four] \n"
|
|
/* G channel extract */
|
|
" vshr.u16 q5, q0, #5 \n"
|
|
|
|
/* load Unpacking Mask for R channel */
|
|
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
|
|
" vand q4, q0, q7 \n"
|
|
|
|
/* load Unpacking Mask for G channel */
|
|
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
|
|
" vand q5, q5, q7 \n"
|
|
/* scale G vector with alpha vector */
|
|
" vmul.u16 q5, q5, q2 \n"
|
|
|
|
/* B channel */
|
|
" vshr.u16 q6, q0, #11 \n"
|
|
|
|
/* blend G vector with input G color*/
|
|
" vmla.u16 q5, q1, %[G] \n"
|
|
|
|
/* vecAlpha * 8 for R & B upscale */
|
|
" vshl.i16 q2, q2, #1 \n"
|
|
|
|
/* scale R vector with alpha vec */
|
|
" vmul.u16 q4, q4, q2 \n"
|
|
|
|
" vshr.u16 q5, q5, #8 \n"
|
|
|
|
/* blend R vector with input R color*/
|
|
" vmla.u16 q4, q1, %[R] \n"
|
|
|
|
/* load packing Mask for G channel */
|
|
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
|
|
|
|
/* scale B vector with alpha vector */
|
|
" vmul.u16 q6, q6, q2 \n"
|
|
" vand q5, q5, q7 \n"
|
|
|
|
/* blend B vector with input B color*/
|
|
" vmla.u16 q6, q1, %[B] \n"
|
|
|
|
/* load packing Mask for B channel */
|
|
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
|
|
|
|
" vshr.u16 q6, q6, #8 \n"
|
|
|
|
/* RGB 565 pack */
|
|
|
|
/* (G & 0x00fc), 8) */
|
|
" vmul.i16 q5, q5, %[eight] \n"
|
|
|
|
/* (B & 0x00f8) */
|
|
" vand q6, q6, q7 \n"
|
|
|
|
/* load next alpha vector */
|
|
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
|
|
|
|
" vmov.i16 q7, #0x0100 \n"
|
|
|
|
/* pack G & B */
|
|
" vmla.u16 q5, q6, %[twofiftysix] \n"
|
|
/* combined (R >> 8) >> 3 */
|
|
" vshr.u16 q4, q4, #11 \n"
|
|
|
|
/* load next target */
|
|
" vldrh.u16 q0, [%[pTarget], #16] \n"
|
|
|
|
/* pack R */
|
|
" vorr q4, q4, q5 \n"
|
|
" vstrh.16 q4, [%[pTarget]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
|
|
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
|
|
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
|
|
[twofiftysix] "r" (256), [scratch] "r" (scratch), [str4Offs] "t"(vStride4Offs)
|
|
:"q0", "q1", "q2", "q4", "q5", "q6", "q7", "memory");
|
|
|
|
pwAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_colour_filling_channel_mask_opacity(uint16_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint32_t * __RESTRICT pwAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint16_t Colour, uint8_t chOpacity)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
|
|
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
|
|
__arm_2d_color_fast_rgb_t tSrcPix;
|
|
|
|
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
|
|
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD_STRIDE, vStride4Offs,
|
|
RGB565_SCAL_OPACITY, vOpacity, pwAlpha, 4);
|
|
#else
|
|
/* RGB565 pack/unpack Masks + opacity */
|
|
/* use memory rather than vmov to optimize Helium operations interleaving */
|
|
uint16x8_t scratch[5];
|
|
|
|
// Unpacking Mask Red
|
|
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
|
|
// Unpacking Mask Green
|
|
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
|
|
// packing Mask Green
|
|
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
|
|
// packing Mask Blue
|
|
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
|
|
// opacity
|
|
vst1q((uint16_t*)&scratch[4], (uint16x8_t)vOpacity);
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint32_t *pAlpha = pwAlpha;
|
|
uint16_t *pCurTarget = pTarget;
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile (
|
|
".p2align 2 \n"
|
|
/* load scheduling */
|
|
" vldrh.u16 q0, [%[pTarget]] \n"
|
|
|
|
" vmov.i16 q7, #0x0100 \n"
|
|
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
|
|
/* opacity vector */
|
|
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
|
|
" vmulh.u8 q1, q1, q6 \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
" add %[pAlpha], %[pAlpha],#(8*4)\n"
|
|
// vecAlpha
|
|
" vsub.i16 q2, q7, q1 \n"
|
|
|
|
/* RGB565 unpack */
|
|
|
|
/* vecAlpha * 4 for G channel upscale */
|
|
" vmul.i16 q2, q2, %[four] \n"
|
|
/* G channel extract */
|
|
" vshr.u16 q5, q0, #5 \n"
|
|
|
|
/* load Unpacking Mask for R channel */
|
|
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
|
|
" vand q4, q0, q7 \n"
|
|
|
|
/* load Unpacking Mask for G channel */
|
|
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
|
|
" vand q5, q5, q7 \n"
|
|
/* scale G vector with alpha vector */
|
|
" vmul.u16 q5, q5, q2 \n"
|
|
|
|
/* B channel */
|
|
" vshr.u16 q6, q0, #11 \n"
|
|
|
|
/* blend G vector with input G color*/
|
|
" vmla.u16 q5, q1, %[G] \n"
|
|
|
|
/* vecAlpha * 8 for R & B upscale */
|
|
" vshl.i16 q2, q2, #1 \n"
|
|
|
|
/* scale R vector with alpha vec */
|
|
" vmul.u16 q4, q4, q2 \n"
|
|
|
|
" vshr.u16 q5, q5, #8 \n"
|
|
|
|
/* blend R vector with input R color*/
|
|
" vmla.u16 q4, q1, %[R] \n"
|
|
|
|
/* load packing Mask for G channel */
|
|
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
|
|
|
|
/* scale B vector with alpha vector */
|
|
" vmul.u16 q6, q6, q2 \n"
|
|
" vand q5, q5, q7 \n"
|
|
|
|
/* blend B vector with input B color*/
|
|
" vmla.u16 q6, q1, %[B] \n"
|
|
|
|
/* load packing Mask for B channel */
|
|
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
|
|
|
|
" vshr.u16 q6, q6, #8 \n"
|
|
|
|
/* RGB 565 pack */
|
|
|
|
/* (G & 0x00fc), 8) */
|
|
" vmul.i16 q5, q5, %[eight] \n"
|
|
|
|
/* (B & 0x00f8) */
|
|
" vand q6, q6, q7 \n"
|
|
|
|
/* load next alpha vector */
|
|
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
|
|
|
|
" vmov.i16 q7, #0x0100 \n"
|
|
|
|
/* pack G & B */
|
|
" vmla.u16 q5, q6, %[twofiftysix] \n"
|
|
/* combined (R >> 8) >> 3 */
|
|
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
|
|
" vmulh.u8 q1, q1, q6 \n"
|
|
" vshr.u16 q4, q4, #11 \n"
|
|
|
|
/* load next target */
|
|
" vldrh.u16 q0, [%[pTarget], #16] \n"
|
|
|
|
/* pack R */
|
|
" vorr q4, q4, q5 \n"
|
|
" vstrh.16 q4, [%[pTarget]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
|
|
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
|
|
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
|
|
[twofiftysix] "r" (256), [scratch] "r" (scratch), [str4Offs] "t"(vStride4Offs)
|
|
:"q0", "q1", "q2", "q4", "q5", "q6", "q7", "memory");
|
|
|
|
pwAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_cccn888_colour_filling_mask(uint32_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint8_t * __RESTRICT pchAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint32_t Colour)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16x8_t v256 = vdupq_n_u16(256);
|
|
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
|
|
uint16_t c0, c1, c2;
|
|
|
|
c0 = Colour & 0xff;
|
|
c1 = (Colour >> 8) & 0xff;
|
|
c2 = (Colour >> 16) & 0xff;
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t * pAlpha = pchAlpha;
|
|
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
|
|
uint8_t * pTargetCh1 = pTargetCh0 + 1;
|
|
uint8_t * pTargetCh2 = pTargetCh0 + 2;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
|
|
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD, _,
|
|
CCCN888_SCAL_OPACITY_NONE, _, 1);
|
|
#else
|
|
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
/* expand chan0 */
|
|
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
|
|
" vsub.i16 q2, %[vec256], q1 \n"
|
|
|
|
/* scale ch0 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
|
|
/* expand chan1 */
|
|
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
|
|
/* blend ch0 vector with input ch0 color*/
|
|
" vmla.u16 q3, q1, %[c0] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
|
|
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
|
|
|
|
/* scale ch1 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
|
|
/* expand chan2 */
|
|
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
|
|
/* blend ch1 vector with input ch1 color*/
|
|
" vmla.u16 q3, q1, %[c1] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
|
|
|
|
" adds %[pTargetCh0], #32 \n"
|
|
" adds %[pTargetCh1], #32 \n"
|
|
|
|
/* scale ch2 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
|
|
|
|
/* blend ch2 vector with input ch2 color*/
|
|
" vmla.u16 q3, q1, %[c2] \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
|
|
|
|
" add.w %[pTargetCh2], %[pTargetCh2], #32 \n"
|
|
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
|
|
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
|
|
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs),
|
|
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
|
|
:"q0", "q1", "q2", "q3", "memory");
|
|
|
|
#endif
|
|
pchAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_cccn888_colour_filling_mask_opacity(uint32_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint8_t * __RESTRICT pchAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint32_t Colour, uint8_t chOpacity)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16x8_t v256 = vdupq_n_u16(256);
|
|
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
|
|
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
|
|
uint16_t c0, c1, c2;
|
|
|
|
c0 = Colour & 0xff;
|
|
c1 = (Colour >> 8) & 0xff;
|
|
c2 = (Colour >> 16) & 0xff;
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t * pAlpha = pchAlpha;
|
|
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
|
|
uint8_t * pTargetCh1 = pTargetCh0 + 1;
|
|
uint8_t * pTargetCh2 = pTargetCh0 + 2;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
|
|
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD, _,
|
|
CCCN888_SCAL_OPACITY, vOpacity, 1);
|
|
#else
|
|
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
/* expand chan0 */
|
|
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
" vmulh.u8 q1, q1, %[vOpacity] \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
|
|
" vsub.i16 q2, %[vec256], q1 \n"
|
|
|
|
/* scale ch0 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
|
|
/* expand chan1 */
|
|
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
|
|
/* blend ch0 vector with input ch0 color*/
|
|
" vmla.u16 q3, q1, %[c0] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
|
|
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
|
|
|
|
/* scale ch1 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
|
|
/* expand chan2 */
|
|
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
|
|
/* blend ch1 vector with input ch1 color*/
|
|
" vmla.u16 q3, q1, %[c1] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
|
|
|
|
" adds %[pTargetCh0], #32 \n"
|
|
" adds %[pTargetCh1], #32 \n"
|
|
|
|
/* scale ch2 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
|
|
|
|
/* blend ch2 vector with input ch2 color*/
|
|
" vmla.u16 q3, q1, %[c2] \n"
|
|
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
|
|
" vmulh.u8 q1, q1, %[vOpacity] \n"
|
|
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
|
|
|
|
" add.w %[pTargetCh2], %[pTargetCh2], #32 \n"
|
|
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
|
|
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
|
|
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs),
|
|
[vOpacity] "t"(vOpacity),
|
|
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
|
|
:"q0", "q1", "q2", "q3", "memory");
|
|
|
|
#endif
|
|
pchAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_cccn888_colour_filling_channel_mask(uint32_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint32_t * __RESTRICT pwAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t * __RESTRICT ptCopySize,
|
|
uint32_t Colour)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16x8_t v256 = vdupq_n_u16(256);
|
|
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
|
|
uint16_t c0, c1, c2;
|
|
|
|
c0 = Colour & 0xff;
|
|
c1 = (Colour >> 8) & 0xff;
|
|
c2 = (Colour >> 16) & 0xff;
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
|
|
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
|
|
uint8_t * pTargetCh1 = pTargetCh0 + 1;
|
|
uint8_t * pTargetCh2 = pTargetCh0 + 2;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
|
|
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD_STRIDE, vStride4Offs,
|
|
CCCN888_SCAL_OPACITY_NONE, _, 4);
|
|
#else
|
|
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
/* expand chan0 */
|
|
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
|
|
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
|
|
" vsub.i16 q2, %[vec256], q1 \n"
|
|
|
|
/* scale ch0 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
|
|
/* expand chan1 */
|
|
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
|
|
/* blend ch0 vector with input ch0 color*/
|
|
" vmla.u16 q3, q1, %[c0] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
|
|
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
|
|
|
|
/* scale ch1 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
|
|
/* expand chan2 */
|
|
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
|
|
/* blend ch1 vector with input ch1 color*/
|
|
" vmla.u16 q3, q1, %[c1] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
|
|
|
|
" adds %[pAlpha], #32 \n"
|
|
" adds %[pTargetCh0], #32 \n"
|
|
|
|
|
|
/* scale ch2 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
|
|
|
|
/* blend ch2 vector with input ch2 color*/
|
|
" vmla.u16 q3, q1, %[c2] \n"
|
|
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
|
|
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
|
|
|
|
" adds %[pTargetCh1], #32 \n"
|
|
" adds %[pTargetCh2], #32 \n"
|
|
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
|
|
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
|
|
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs),
|
|
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
|
|
:"q0", "q1", "q2", "q3", "memory");
|
|
|
|
#endif
|
|
pwAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_cccn888_colour_filling_channel_mask_opacity(uint32_t * __RESTRICT pTarget,
|
|
int16_t iTargetStride,
|
|
uint32_t * __RESTRICT pwAlpha,
|
|
int16_t iAlphaStride,
|
|
arm_2d_size_t *
|
|
__RESTRICT ptCopySize,
|
|
uint32_t Colour, uint8_t chOpacity)
|
|
{
|
|
int_fast16_t iHeight = ptCopySize->iHeight;
|
|
int_fast16_t iWidth = ptCopySize->iWidth;
|
|
uint16x8_t v256 = vdupq_n_u16(256);
|
|
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
|
|
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
|
|
uint16_t c0, c1, c2;
|
|
|
|
c0 = Colour & 0xff;
|
|
c1 = (Colour >> 8) & 0xff;
|
|
c2 = (Colour >> 16) & 0xff;
|
|
|
|
for (int_fast16_t y = 0; y < iHeight; y++) {
|
|
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
|
|
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
|
|
uint8_t * pTargetCh1 = pTargetCh0 + 1;
|
|
uint8_t * pTargetCh2 = pTargetCh0 + 2;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
|
|
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD_STRIDE, vStride4Offs,
|
|
CCCN888_SCAL_OPACITY, vOpacity, 4);
|
|
#else
|
|
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = iWidth;
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
/* expand chan0 */
|
|
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
|
|
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
|
|
" vmulh.u8 q1, q1, %[vOpacity] \n"
|
|
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
|
|
" vsub.i16 q2, %[vec256], q1 \n"
|
|
|
|
/* scale ch0 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
|
|
/* expand chan1 */
|
|
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
|
|
/* blend ch0 vector with input ch0 color*/
|
|
" vmla.u16 q3, q1, %[c0] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
|
|
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
|
|
|
|
/* scale ch1 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
|
|
/* expand chan2 */
|
|
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
|
|
/* blend ch1 vector with input ch1 color*/
|
|
" vmla.u16 q3, q1, %[c1] \n"
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
|
|
|
|
" adds %[pAlpha], #32 \n"
|
|
" adds %[pTargetCh0], #32 \n"
|
|
|
|
|
|
/* scale ch2 vector with alpha vector */
|
|
" vmul.u16 q3, q0, q2 \n"
|
|
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
|
|
|
|
/* blend ch2 vector with input ch2 color*/
|
|
" vmla.u16 q3, q1, %[c2] \n"
|
|
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
|
|
" vmulh.u8 q1, q1, %[vOpacity] \n"
|
|
|
|
" vshr.u16 q3, q3, #8 \n"
|
|
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
|
|
|
|
" adds %[pTargetCh1], #32 \n"
|
|
" adds %[pTargetCh2], #32 \n"
|
|
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
|
|
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
|
|
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs), [vOpacity] "t"(vOpacity),
|
|
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
|
|
:"q0", "q1", "q2", "q3", "memory");
|
|
|
|
#endif
|
|
pwAlpha += (iAlphaStride);
|
|
pTarget += (iTargetStride);
|
|
}
|
|
}
|
|
|
|
|
|
/*----------------------------------------------------------------------------*
 * Convert Colour format                                                      *
 *----------------------------------------------------------------------------*/
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_cccn888_to_rgb565(uint32_t *__RESTRICT pwSourceBase,
|
|
int16_t iSourceStride,
|
|
uint16_t *__RESTRICT phwTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize)
|
|
{
|
|
int32_t blkCnt;
|
|
uint32x4_t maskR = vdupq_n_u32(0x001f);
|
|
uint32x4_t maskG = vdupq_n_u32(0x07e0);
|
|
uint32x4_t maskB = vdupq_n_u32(0xf800);
|
|
|
|
for (int_fast16_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
const uint32_t *pSource = pwSourceBase;
|
|
uint16_t *pTarget = phwTargetBase;
|
|
|
|
blkCnt = ptCopySize->iWidth;
|
|
#ifdef USE_MVE_INTRINSICS
|
|
do {
|
|
mve_pred16_t tailPred = vctp32q(blkCnt);
|
|
|
|
/* load a vector of 4 cccn888 pixels */
|
|
uint32x4_t vecIn = vld1q_z(pSource, tailPred);
|
|
/* extract individual channels and place them according bit position */
|
|
uint32x4_t vecR = (vecIn >> 3) & maskR;
|
|
uint32x4_t vecG = (vecIn >> 5) & maskG;
|
|
uint32x4_t vecB = (vecIn >> 8) & maskB;
|
|
/* merge */
|
|
uint32x4_t vOut = vecR | vecG | vecB;
|
|
|
|
/* store a vector of 4 rgb565 pixels */
|
|
vstrhq_p_u32(pTarget, vOut, tailPred);
|
|
|
|
pSource += 4;
|
|
pTarget += 4;
|
|
blkCnt -= 4;
|
|
}
|
|
while (blkCnt > 0);
|
|
#else
|
|
const int32_t inv_2pow3 = 1 << (31-3); /*1/2^3 in Q.31 */
|
|
const int32_t inv_2pow5 = 1 << (31-5); /*1/2^5 in Q.31 */
|
|
const int32_t inv_2pow8 = 1 << (31-8); /*1/2^8 in Q.31 */
|
|
|
|
__asm volatile(
|
|
" wlstp.32 lr, %[loopCnt], 1f \n"
|
|
/* precompute for allowing filling stalls in the inner loop */
|
|
/* use vqdmulh to replace shifts to allow overlap with 'AND' */
|
|
|
|
/* load a vector of 4 cccn888 pixels */
|
|
" vldrw.u32 q0, [%[pSource]], #16 \n"
|
|
/* mimic right shift by 3 */
|
|
" vqdmulh.s32 q1, q0, %[inv_2pow3] \n"
|
|
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
" vand q1, q1, %[maskR] \n"
|
|
/* mimic right shift by 5 */
|
|
" vqdmulh.s32 q2, q0, %[inv_2pow5] \n"
|
|
" vand q2, q2, %[maskG] \n"
|
|
/* mimic right shift by 8 */
|
|
" vqdmulh.s32 q3, q0, %[inv_2pow8] \n"
|
|
/* accumulate R & G */
|
|
" vorr q2, q1, q2 \n"
|
|
/* load next vector of 4 cccn888 pixels */
|
|
" vldrw.u32 q0, [%[pSource]], #16 \n"
|
|
" vand q3, q3, %[maskB] \n"
|
|
/* mimic right shift by 3 */
|
|
" vqdmulh.s32 q1, q0, %[inv_2pow3] \n"
|
|
/* accumulate B */
|
|
" vorr q2, q2, q3 \n"
|
|
/* store a vector of 4 rgb565 pixels */
|
|
" vstrh.32 q2, [%[pTarget]], #8 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pSource] "+r"(pSource), [pTarget] "+r" (pTarget)
|
|
: [loopCnt] "r"(blkCnt), [inv_2pow3] "r" (inv_2pow3),
|
|
[inv_2pow5] "r" (inv_2pow5), [inv_2pow8] "r" (inv_2pow8),
|
|
[maskR] "t" (maskR),[maskG] "t" (maskG),[maskB] "t" (maskB)
|
|
: "q0", "q1", "q2", "q3", "memory", "r14" );
|
|
#endif
|
|
|
|
pwSourceBase += iSourceStride;
|
|
phwTargetBase += iTargetStride;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_to_cccn888(uint16_t *__RESTRICT phwSourceBase,
|
|
int16_t iSourceStride,
|
|
uint32_t *__RESTRICT pwTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize)
|
|
{
|
|
int32_t blkCnt;
|
|
uint32x4_t maskRB = vdupq_n_u32(0xf8);
|
|
uint32x4_t maskG = vdupq_n_u32(0xfc00);
|
|
|
|
|
|
for (int_fast16_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
|
|
const uint16_t *__RESTRICT phwSource = phwSourceBase;
|
|
uint32_t *__RESTRICT pwTarget = pwTargetBase;
|
|
|
|
blkCnt = ptCopySize->iWidth;
|
|
#ifdef USE_MVE_INTRINSICS
|
|
do {
|
|
mve_pred16_t tailPred = vctp32q(blkCnt);
|
|
|
|
/* load a vector of 4 rgb565 pixels */
|
|
uint32x4_t vecIn = vldrhq_z_u32(phwSource, tailPred);
|
|
/* extract individual channels and place them according position */
|
|
uint32x4_t vecR = (vecIn << 3) & maskRB;
|
|
uint32x4_t vecG = (vecIn << 5) & maskG;
|
|
uint32x4_t vecB = ((vecIn >> 8) & maskRB) << 16;
|
|
/* merge and set n channel to 0xff */
|
|
uint32x4_t vOut = 0xff000000 | vecR | vecG | vecB;
|
|
|
|
/* store a vector of 4 cccn888 pixels */
|
|
vst1q_p(pwTarget, vOut, tailPred);
|
|
|
|
phwSource += 4;
|
|
pwTarget += 4;
|
|
blkCnt -= 4;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
#else
|
|
__asm volatile(
|
|
" wlstp.32 lr, %[loopCnt], 1f \n"
|
|
/* precompute for allowing filling stalls in the inner loop */
|
|
/* use vqdmulh & vmul to replace shifts to allow overlap with 'AND' */
|
|
|
|
/* load a vector of 4 rgb565 pixels */
|
|
" vldrh.u32 q0, [%[pSource]], #8 \n"
|
|
/* mimic left shift by 3 */
|
|
" vmul.u32 q1, q0, %[two_pow3] \n"
|
|
".p2align 2 \n"
|
|
"2: \n"
|
|
/* mimic left shift by 5 */
|
|
" vmul.u32 q2, q0, %[two_pow5] \n"
|
|
" vand q1, q1, %[maskRB] \n"
|
|
/* mimic right shift by 8 */
|
|
" vqdmulh.s32 q3, q0, %[inv_2pow8] \n"
|
|
" vand q2, q2, %[maskG] \n"
|
|
/* accumulate G & R, use vmla instead of vorr for best overlap */
|
|
" vmla.u32 q2, q1, %[one] \n"
|
|
" vand q3, q3, %[maskRB] \n"
|
|
/* accumulate B + left shift by 16 */
|
|
" vmla.u32 q2, q3, %[two_pow16] \n"
|
|
/* load next vector of 4 rgb565 pixels */
|
|
" vldrh.u32 q0, [%[pSource]], #8 \n"
|
|
/* merge and set n channel to 0xff */
|
|
" vorr.i32 q2, #0xff000000 \n"
|
|
/* mimic left shift by 3 */
|
|
" vmul.u32 q1, q0, %[two_pow3] \n"
|
|
/* store a vector of 4 cccn888 pixels */
|
|
" vstrw.32 q2, [%[pTarget]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [pSource] "+r"(phwSource), [pTarget] "+r" (pwTarget)
|
|
: [loopCnt] "r"(blkCnt),[two_pow3] "r" (1<<3), [two_pow5] "r" (1<<5),
|
|
[two_pow16] "r" (1<<16),[inv_2pow8] "r" (1 << (31-8)),
|
|
[maskRB] "t" (maskRB),[maskG] "t" (maskG), [one] "r" (1)
|
|
: "q0", "q1", "q2", "q3", "memory", "r14" );
|
|
#endif
|
|
|
|
phwSourceBase += iSourceStride;
|
|
pwTargetBase += iTargetStride;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Tile-fill an RGB565 target with an RGB565 source, blended through an
 * 8-bit source mask and an 8-bit target mask.
 *
 * The source tile is repeated horizontally (spans of at most
 * ptSourceSize->iWidth pixels) and vertically (the source rows are replayed
 * from the top) until ptTargetSize is covered. For every pixel
 *
 *      alpha = (source mask * target mask) >> 8
 *
 * The inline-assembly path blends each channel as
 *      source * alpha + target * (256 - alpha)
 * and repacks the result to RGB565. The intrinsics path passes
 * (256 - alpha) to __arm_2d_rgb565_blending_opacity_single_vec(), which is
 * defined outside this function (presumably the same blend - confirm
 * against its definition).
 *
 * ptSourceBase / iSourceStride / ptSourceSize
 *      first pixel, row pitch (in pixels) and size of the source tile
 * pchSourceMaskBase / iSourceMaskStride / ptSourceMaskSize
 *      first byte and row pitch (in bytes) of the source mask. The mask
 *      restarts from its first row with every vertical repetition of the
 *      tile. ptSourceMaskSize is only read when
 *      __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING is enabled.
 * ptTargetBase / iTargetStride / ptTargetSize
 *      first pixel, row pitch (in pixels) and size of the area to fill
 * pchTargetMaskBase / iTargetMaskStride / ptTargetMaskSize
 *      first byte and row pitch (in bytes) of the target mask. It advances
 *      by one row per target row and never wraps; ptTargetMaskSize is not
 *      read here.
 *
 * Channel labels in the comments follow the code's own naming:
 * "R" = bits 0..4, "G" = bits 5..10, "B" = bits 11..15 of a pixel.
 *
 * NOTE(review): both paths read input vectors beyond the end of a span.
 * The intrinsics path loads full 8-lane vectors and only predicates the
 * store; the assembly path loads the first mask vectors ahead of wlstp and
 * fetches the next ones at the end of every iteration. The buffers have to
 * tolerate these reads.
 */
void __arm_2d_impl_rgb565_masks_fill(uint16_t * __RESTRICT ptSourceBase,
                                     int16_t iSourceStride,
                                     arm_2d_size_t * __RESTRICT ptSourceSize,
                                     uint8_t * __RESTRICT pchSourceMaskBase,
                                     int16_t iSourceMaskStride,
                                     arm_2d_size_t * __RESTRICT ptSourceMaskSize,
                                     uint16_t * __RESTRICT ptTargetBase,
                                     int16_t iTargetStride,
                                     arm_2d_size_t * __RESTRICT ptTargetSize,
                                     uint8_t * __RESTRICT pchTargetMaskBase,
                                     int16_t iTargetMaskStride,
                                     arm_2d_size_t * __RESTRICT ptTargetMaskSize)
{
    /*
     * The row loop below only advances iTargetY from inside the source-row
     * loop, and the span loop only shrinks wLengthLeft by the span length.
     * A source tile without rows or without columns would therefore spin
     * forever, so degenerate sizes are rejected up front. A non-positive
     * target size leaves nothing to draw (and a negative width would turn
     * into a huge unsigned span).
     */
    if (    (ptSourceSize->iWidth <= 0) || (ptSourceSize->iHeight <= 0)
        ||  (ptTargetSize->iWidth <= 0) || (ptTargetSize->iHeight <= 0)) {
        return;
    }

    uint8_t *__RESTRICT pchTargetMaskLineBase = pchTargetMaskBase;
    uint16x8_t v256 = vdupq_n_u16(256);

#ifndef USE_MVE_INTRINSICS
    /* constants and one spill slot, addressed by the assembly via [scratch] */
    uint16x8_t scratch[5];

    /* vector of 256 avoiding use of vdup to increase overlap efficiency */
    vst1q((uint16_t *) & scratch[0], v256);
    /* scratch[1] is temporary for blended Red chan. vector */

    /* G channel unpacking mask (applied after the channel is moved to bits 2..7) */
    vst1q((uint16_t *) & scratch[2], vdupq_n_u16(0x00fc));
    /* B channel packing mask */
    vst1q((uint16_t *) & scratch[3], vdupq_n_u16(0xf800));
    /* G channel packing Mask */
    vst1q((uint16_t *) & scratch[4], vdupq_n_u16(0x07e0));

    /* use of fixed point mult instead of vshr to increase overlap efficiency */
    const int16_t inv_2pow3 = 1 << (15 - 3); /* 1/(2^3) in Q.15 */
#endif

    /* iTargetY is advanced inside the source-row loop, once per drawn row */
    for (int_fast16_t iTargetY = 0; iTargetY < ptTargetSize->iHeight;) {
        /* every vertical repetition restarts at the top of tile and mask */
        uint16_t *__RESTRICT ptSource = ptSourceBase;
        uint8_t *pchSourceMask = pchSourceMaskBase;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
        int_fast16_t iSourceMaskY = 0;
#endif

        for (int_fast16_t iSourceY = 0; iSourceY < ptSourceSize->iHeight; iSourceY++) {
            uint16_t *__RESTRICT ptTarget = ptTargetBase;
            uint8_t *__RESTRICT pchTargetMask = pchTargetMaskLineBase;
            uint_fast32_t wLengthLeft = ptTargetSize->iWidth;

            /* one pass per horizontal repetition of the source row */
            do {
                uint_fast32_t wLength = MIN(wLengthLeft, ptSourceSize->iWidth);
                uint16_t *__RESTRICT ptSrc = ptSource;
                uint8_t *__RESTRICT pchSrcMsk = pchSourceMask;
                uint16_t *__RESTRICT ptTargetCur = ptTarget;
                uint8_t *__RESTRICT pchTargetMaskCur = pchTargetMask;

#ifdef USE_MVE_INTRINSICS
                int32_t blkCnt = wLength;

                do {
                    uint16x8_t vecTarget = vld1q(ptTargetCur);
                    uint16x8_t vecSource = vld1q(ptSrc);
                    /* mask bytes widened to 16-bit lanes */
                    uint16x8_t vecSrcMsk = vldrbq_u16(pchSrcMsk);
                    uint16x8_t vecTargetMask = vldrbq_u16(pchTargetMaskCur);
                    uint16x8_t vecHwOpacity =
                        vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8);

                    vecTarget = __arm_2d_rgb565_blending_opacity_single_vec(
                                        vecTarget, vecSource, vecHwOpacity);
                    /* tail predication */
                    vst1q_p_u16(ptTargetCur, vecTarget, vctp16q(blkCnt));

                    pchSrcMsk += 8;
                    pchTargetMaskCur += 8;
                    ptTargetCur += 8;
                    ptSrc += 8;

                    blkCnt -= 8;
                }
                while (blkCnt > 0);

#else
                /* loop counter pinned to lr, the register wlstp/letp use */
                register unsigned blkCnt __asm("lr");
                blkCnt = wLength;

                __asm volatile (
                    /* R & B mask */
                    "vecRBUnpackMask         .req q7                            \n"
                    "vecAlpha                .req q5                            \n"
                    "vecHwOpacity            .req q3                            \n"

                    /* preload (issued ahead of wlstp) */
                    "   vldrb.u16               q0, [%[pchSrcMsk]], #8          \n"
                    "   vmov.i16                vecRBUnpackMask, #0x00f8        \n"
                    "   vldrb.u16               q5, [%[pchTargetMask]], #8      \n"

                    ".p2align 2                                                 \n"
                    "   wlstp.16                lr, %[loopCnt], 1f              \n"
                    "2:                                                         \n"
                    /* vecSrcMsk * vecTargetMask */
                    "   vmul.i16                q0, q5, q0                      \n"
                    "   vldrh.u16               q6, [%[ptTarget]]               \n"
                    "   vshr.u16                vecAlpha, q0, #8                \n"
                    /* 256-dup vector */
                    "   vldrh.u16               q1, [%[scratch], #(16*0)]       \n"
                    /* vecHwOpacity =
                           vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8) */
                    "   vsub.i16                vecHwOpacity, q1, vecAlpha      \n"
                    "   vldrh.u16               q1, [%[ptSrc]], #16             \n"
                    /* target: vshl #3; source: multiply by 8 mimics vshl #3 */
                    "   vshl.u16                q0, q6, #3                      \n"
                    "   vmul.i16                q4, q1, %[eight]                \n"
                    /* vecR extract and scale */
                    "   vand                    q0, q0, vecRBUnpackMask         \n"
                    "   vmul.i16                q0, vecHwOpacity, q0            \n"
                    /* vecSrcR extract and scale */
                    "   vand                    q4, q4, vecRBUnpackMask         \n"
                    "   vmul.i16                q4, vecAlpha, q4                \n"
                    /* 0xfc G-mask */
                    "   vldrw.u32               q2, [%[scratch], #(16*2)]       \n"
                    "   vadd.i16                q4, q0, q4                      \n"
                    /* push blended R */
                    "   vstrw.32                q4, [%[scratch], #(16*1)]       \n"
                    /* mimic vshr.u16 q4, q6, #3 */
                    "   vqdmulh.s16             q4, q6, %[inv_2pow3]            \n"
                    "   vshr.u16                q0, q1, #3                      \n"
                    /* vecG extract and scale */
                    "   vand                    q4, q4, q2                      \n"
                    "   vmul.i16                q4, vecHwOpacity, q4            \n"
                    /* vecSrcG extract and scale */
                    "   vand                    q0, q0, q2                      \n"
                    "   vmul.i16                q2, vecAlpha, q0                \n"
                    "   vshr.u16                q0, q1, #8                      \n"
                    /* blended G */
                    /* vadd.i16 q2, q4, q2
                       addition using vmla for more efficient overlap */
                    "   vmla.u16                q2, q4, %[one]                  \n"
                    /* vecB extract and scale */
                    "   vshr.u16                q4, q6, #8                      \n"
                    "   vand                    q4, q4, vecRBUnpackMask         \n"
                    "   vmul.i16                q4, vecHwOpacity, q4            \n"
                    /* vecSrcB extract and scale */
                    "   vand                    q0, q0, vecRBUnpackMask         \n"
                    "   vmul.i16                q0, vecAlpha, q0                \n"

                    ".unreq vecAlpha                                            \n"
                    ".unreq vecHwOpacity                                        \n"
                    ".unreq vecRBUnpackMask                                     \n"

                    /* reload blended R */
                    "   vldrw.u32               q1, [%[scratch], #(16*1)]       \n"
                    /* blended B
                       vadd.i16 q0, q4, q0
                       addition using vmla for more efficient overlap */
                    "   vmla.u16                q0, q4, %[one]                  \n"
                    /* pack R */
                    "   vshr.u16                q3, q1, #11                     \n"
                    /* B channel packing mask 0xf800 */
                    "   vldrw.u32               q4, [%[scratch], #(16*3)]       \n"
                    "   vand                    q0, q0, q4                      \n"
                    /* accumulate R & B */
                    "   vorr                    q4, q0, q3                      \n"
                    /* G channel packing mask 0x07e0 */
                    "   vldrw.u32               q3, [%[scratch], #(16*4)]       \n"
                    "   vshr.u16                q2, q2, #5                      \n"
                    /* load next source mask */
                    "   vldrb.u16               q0, [%[pchSrcMsk]], #8          \n"
                    /* G channel masking */
                    "   vand                    q2, q2, q3                      \n"
                    /* load next target mask */
                    "   vldrb.u16               q5, [%[pchTargetMask]], #8      \n"
                    /* pack G with R.B */
                    "   vorr                    q4, q4, q2                      \n"
                    "   vstrh.16                q4, [%[ptTarget]], #16          \n"
                    "   letp                    lr, 2b                          \n"
                    "1:                                                         \n"
                    :[ptTarget] "+r"(ptTargetCur),[ptSrc] "+r"(ptSrc),
                     [pchTargetMask] "+l"(pchTargetMaskCur),[pchSrcMsk] "+l"(pchSrcMsk),
                     [loopCnt] "+r"(blkCnt)
                    :[scratch] "r" (scratch),[eight] "r"(8),[inv_2pow3] "r"(inv_2pow3),
                     [one] "r" (1)
                    :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");

#endif
                /* next repetition of the source row continues to the right */
                ptTarget += wLength;
                pchTargetMask += wLength;

                wLengthLeft -= wLength;
            } while (wLengthLeft);

            ptSource += iSourceStride;
            ptTargetBase += iTargetStride;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
            iSourceMaskY++;
            //! handle source mask
            if (    (iSourceMaskY >= ptSourceMaskSize->iHeight)
                ||  (iSourceMaskY >= ptSourceSize->iHeight)) {
                pchSourceMask = pchSourceMaskBase;
                iSourceMaskY = 0;
            } else {
                pchSourceMask += iSourceMaskStride;
            }
#else
            pchSourceMask += iSourceMaskStride;
#endif

            /* the target mask follows the target rows, one row at a time */
            pchTargetMaskLineBase += iTargetMaskStride;

            iTargetY++;
            if (iTargetY >= ptTargetSize->iHeight) {
                break;
            }
        }
    }
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb565_src_msk_1h_des_msk_fill(
|
|
uint16_t * __RESTRICT ptSourceBase,
|
|
int16_t iSourceStride,
|
|
arm_2d_size_t * __RESTRICT ptSourceSize,
|
|
uint8_t * __RESTRICT pchSourceMaskBase,
|
|
int16_t iSourceMaskStride,
|
|
arm_2d_size_t * __RESTRICT ptSourceMaskSize,
|
|
uint16_t * __RESTRICT ptTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t * __RESTRICT ptTargetSize,
|
|
uint8_t * __RESTRICT pchTargetMaskBase,
|
|
int16_t iTargetMaskStride,
|
|
arm_2d_size_t * __RESTRICT ptTargetMaskSize)
|
|
{
|
|
uint8_t *__RESTRICT pchTargetMaskLineBase = pchTargetMaskBase;
|
|
uint16x8_t v256 = vdupq_n_u16(256);
|
|
|
|
#ifndef USE_MVE_INTRINSICS
|
|
uint16x8_t scratch[5];
|
|
|
|
/* vector of 256 avoiding use of vdup to increase overlap efficiency*/
|
|
vst1q((uint16_t *) & scratch[0], v256);
|
|
/* scratch[1] is temporary for blended Red chan. vector */
|
|
|
|
/* Unpacking Mask Red */
|
|
vst1q((uint16_t *) & scratch[2], vdupq_n_u16(0x00fc));
|
|
/* B channel packing mask */
|
|
vst1q((uint16_t *) & scratch[3], vdupq_n_u16(0xf800));
|
|
/* G channel packing Mask */
|
|
vst1q((uint16_t *) & scratch[4], vdupq_n_u16(0x07e0));
|
|
|
|
/* use of fixed point mult instead of vshr to increase overlap efficiency */
|
|
const int16_t inv_2pow3 = 1 << (15 - 3); /* 1/(2^3) in Q.15 */
|
|
#endif
|
|
|
|
for (int_fast16_t iTargetY = 0; iTargetY < ptTargetSize->iHeight;) {
|
|
uint16_t *__RESTRICT ptSource = ptSourceBase;
|
|
uint8_t *pchSourceMask = pchSourceMaskBase;
|
|
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
|
|
int_fast16_t iSourceMaskY = 0;
|
|
#endif
|
|
|
|
for (int_fast16_t iSourceY = 0; iSourceY < ptSourceSize->iHeight; iSourceY++) {
|
|
uint16_t *__RESTRICT ptTarget = ptTargetBase;
|
|
uint8_t *__RESTRICT pchTargetMask = pchTargetMaskLineBase;
|
|
uint_fast32_t wLengthLeft = ptTargetSize->iWidth;
|
|
|
|
do {
|
|
uint_fast32_t wLength = MIN(wLengthLeft, ptSourceSize->iWidth);
|
|
uint16_t *__RESTRICT ptSrc = ptSource;
|
|
uint8_t *__RESTRICT pchSrcMsk = pchSourceMask;
|
|
uint16_t *__RESTRICT ptTargetCur = ptTarget;
|
|
uint8_t *__RESTRICT pchTargetMaskCur = pchTargetMask;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
int32_t blkCnt = wLength;
|
|
|
|
do {
|
|
uint16x8_t vecTarget = vld1q(ptTargetCur);
|
|
uint16x8_t vecSource = vld1q(ptSrc);
|
|
uint16x8_t vecSrcMsk = vldrbq_u16(pchSrcMsk);
|
|
uint16x8_t vecTargetMask = vldrbq_u16(pchTargetMaskCur);
|
|
uint16x8_t vecHwOpacity =
|
|
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8);
|
|
|
|
vecTarget = __arm_2d_rgb565_blending_opacity_single_vec(
|
|
vecTarget, vecSource, vecHwOpacity);
|
|
/* tail predication */
|
|
vst1q_p_u16(ptTargetCur, vecTarget, vctp16q(blkCnt));
|
|
|
|
pchSrcMsk += 8;
|
|
pchTargetMaskCur += 8;
|
|
ptTargetCur += 8;
|
|
ptSrc += 8;
|
|
|
|
blkCnt -= 8;
|
|
}
|
|
while (blkCnt > 0);
|
|
|
|
#else
|
|
register unsigned blkCnt __asm("lr");
|
|
blkCnt = wLength;
|
|
|
|
__asm volatile (
|
|
/* R & B mask */
|
|
"vecRBUnpackMask .req q7 \n"
|
|
"vecAlpha .req q5 \n"
|
|
"vecHwOpacity .req q3 \n"
|
|
|
|
/* preload */
|
|
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
|
|
" vmov.i16 vecRBUnpackMask, #0x00f8 \n"
|
|
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
|
|
|
|
".p2align 2 \n"
|
|
" wlstp.16 lr, %[loopCnt], 1f \n"
|
|
"2: \n"
|
|
/* vecSrcMsk * vecTargetMask */
|
|
" vmul.i16 q0, q5, q0 \n"
|
|
" vldrh.u16 q6, [%[ptTarget]] \n"
|
|
" vshr.u16 vecAlpha, q0, #8 \n"
|
|
/* 256-dup vector */
|
|
" vldrh.u16 q1, [%[scratch], #(16*0)] \n"
|
|
/* vecHwOpacity =
|
|
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8) */
|
|
" vsub.i16 vecHwOpacity, q1, vecAlpha \n"
|
|
" vldrh.u16 q1, [%[ptSrc]], #16 \n"
|
|
/* mimic vshl #3 */
|
|
" vshl.u16 q0, q6, #3 \n"
|
|
" vmul.i16 q4, q1, %[eight] \n"
|
|
/* vecR extract and scale */
|
|
" vand q0, q0, vecRBUnpackMask \n"
|
|
" vmul.i16 q0, vecHwOpacity, q0 \n"
|
|
/* vecSrcR extract and scale */
|
|
" vand q4, q4, vecRBUnpackMask \n"
|
|
" vmul.i16 q4, vecAlpha, q4 \n"
|
|
/* 0xfc G-mask */
|
|
" vldrw.u32 q2, [%[scratch], #(16*2)] \n"
|
|
" vadd.i16 q4, q0, q4 \n"
|
|
/* push blended R */
|
|
" vstrw.32 q4, [%[scratch], #(16*1)] \n"
|
|
/* mimic vshr.u16 q4, q6, #3 */
|
|
" vqdmulh.s16 q4, q6, %[inv_2pow3] \n"
|
|
" vshr.u16 q0, q1, #3 \n"
|
|
/* vecG extract and scale */
|
|
" vand q4, q4, q2 \n"
|
|
" vmul.i16 q4, vecHwOpacity, q4 \n"
|
|
/* vecSrcG extract and scale */
|
|
" vand q0, q0, q2 \n"
|
|
" vmul.i16 q2, vecAlpha, q0 \n"
|
|
" vshr.u16 q0, q1, #8 \n"
|
|
/* blended G */
|
|
/* vadd.i16 q2, q4, q2
|
|
addition using vmla for more efficient overlap */
|
|
" vmla.u16 q2, q4, %[one] \n"
|
|
/* vecB extract and scale */
|
|
" vshr.u16 q4, q6, #8 \n"
|
|
" vand q4, q4, vecRBUnpackMask \n"
|
|
" vmul.i16 q4, vecHwOpacity, q4 \n"
|
|
/* vecSrcB extract and scale */
|
|
" vand q0, q0, vecRBUnpackMask \n"
|
|
" vmul.i16 q0, vecAlpha, q0 \n"
|
|
|
|
".unreq vecAlpha \n"
|
|
".unreq vecHwOpacity \n"
|
|
".unreq vecRBUnpackMask \n"
|
|
|
|
/* reload blended R */
|
|
" vldrw.u32 q1, [%[scratch], #(16*1)] \n"
|
|
/* blended B
|
|
vadd.i16 q0, q4, q0
|
|
addition using vmla for more efficient overlap */
|
|
" vmla.u16 q0, q4, %[one] \n"
|
|
/* pack R */
|
|
" vshr.u16 q3, q1, #11 \n"
|
|
/* B channel packing mask 0xf800 */
|
|
" vldrw.u32 q4, [%[scratch], #(16*3)] \n"
|
|
" vand q0, q0, q4 \n"
|
|
/* accumulate R & B */
|
|
" vorr q4, q0, q3 \n"
|
|
/* G channel packing mask 0x07e0 */
|
|
" vldrw.u32 q3, [%[scratch], #(16*4)] \n"
|
|
" vshr.u16 q2, q2, #5 \n"
|
|
/* load next source mask */
|
|
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
|
|
/* G channel masking */
|
|
" vand q2, q2, q3 \n"
|
|
/* load next target mask */
|
|
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
|
|
/* pack G with R.B */
|
|
" vorr q4, q4, q2 \n"
|
|
" vstrh.16 q4, [%[ptTarget]], #16 \n"
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
:[ptTarget] "+r"(ptTargetCur),[ptSrc] "+r"(ptSrc),
|
|
[pchTargetMask] "+l"(pchTargetMaskCur),[pchSrcMsk] "+l"(pchSrcMsk),
|
|
[loopCnt] "+r"(blkCnt)
|
|
:[scratch] "r" (scratch),[eight] "r"(8),[inv_2pow3] "r"(inv_2pow3),
|
|
[one] "r" (1)
|
|
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
|
|
|
|
#endif
|
|
ptTarget += wLength;
|
|
pchTargetMask += wLength;
|
|
|
|
wLengthLeft -= wLength;
|
|
} while (wLengthLeft);
|
|
|
|
ptSource += iSourceStride;
|
|
ptTargetBase += iTargetStride;
|
|
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
|
|
iSourceMaskY++;
|
|
//! handle source mask
|
|
if ( (iSourceMaskY >= ptSourceMaskSize->iHeight)
|
|
|| (iSourceMaskY >= ptSourceSize->iHeight)) {
|
|
pchSourceMask = pchSourceMaskBase;
|
|
iSourceMaskY = 0;
|
|
} else {
|
|
pchSourceMask += iSourceMaskStride;
|
|
}
|
|
#else
|
|
pchSourceMask += iSourceMaskStride;
|
|
#endif
|
|
|
|
pchTargetMaskLineBase = pchTargetMaskBase;
|
|
|
|
iTargetY++;
|
|
if (iTargetY >= ptTargetSize->iHeight) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
#if defined(__clang__)
|
|
# pragma clang diagnostic pop
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif // __ARM_2D_HAS_HELIUM__
|
|
|