3933 lines
170 KiB
C

/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: Arm-2D Library
* Title: arm-2d_helium.c
* Description: Acceleration extensions using Helium.
*
* $Date: 03. Aug 2022
* $Revision: V.0.13.6
*
* Target Processor: Cortex-M cores with Helium
*
* -------------------------------------------------------------------- */
#define __ARM_2D_IMPL__
#include "arm_2d.h"
#include "__arm_2d_impl.h"
#if defined(__ARM_2D_HAS_HELIUM__) && __ARM_2D_HAS_HELIUM__
#if defined(__clang__)
# pragma clang diagnostic ignored "-Wunknown-warning-option"
# pragma clang diagnostic ignored "-Wreserved-identifier"
# pragma clang diagnostic ignored "-Wincompatible-pointer-types-discards-qualifiers"
# pragma clang diagnostic ignored "-Wcast-qual"
# pragma clang diagnostic ignored "-Wcast-align"
# pragma clang diagnostic ignored "-Wextra-semi-stmt"
# pragma clang diagnostic ignored "-Wsign-conversion"
# pragma clang diagnostic ignored "-Wunused-function"
# pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
# pragma clang diagnostic ignored "-Wdouble-promotion"
# pragma clang diagnostic ignored "-Wunused-parameter"
# pragma clang diagnostic ignored "-Wimplicit-float-conversion"
# pragma clang diagnostic ignored "-Wimplicit-int-conversion"
# pragma clang diagnostic ignored "-Wtautological-pointer-compare"
# pragma clang diagnostic ignored "-Wmissing-prototypes"
# pragma clang diagnostic ignored "-Wsign-compare"
# pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
# pragma clang diagnostic ignored "-Wpadded"
# pragma clang diagnostic ignored "-Wvector-conversion"
# pragma clang diagnostic ignored "-Wundef"
# pragma clang diagnostic ignored "-Wdeclaration-after-statement"
#endif
/*============================ INCLUDES ======================================*/
#include "__arm_2d_paving_helium.h"
#include "__arm_2d_math_helium.h"
#include "__arm_2d_utils_helium.h"
#ifdef __cplusplus
extern "C" {
#endif
/*============================ MACROS ========================================*/
/*============================ MACROFIED FUNCTIONS ===========================*/
/*============================ TYPES =========================================*/
/*============================ GLOBAL VARIABLES ==============================*/
/*============================ PROTOTYPES ====================================*/
/*============================ LOCAL VARIABLES ===============================*/
/*============================ IMPLEMENTATION ================================*/
/*!
* \brief initialise the helium acceleration
*/
void __arm_2d_helium_init(void)
{
/* even if this is empty, do not remove it */
}
/*----------------------------------------------------------------------------*
* Code Template for tile operations *
*----------------------------------------------------------------------------*/
#define __API_COLOUR c8bit
#define __API_INT_TYPE uint8_t
#define __API_INT_TYPE_BIT_NUM 8
#include "__arm_2d_copy_helium.inc"
#define __API_COLOUR rgb16
#define __API_INT_TYPE uint16_t
#define __API_INT_TYPE_BIT_NUM 16
#include "__arm_2d_copy_helium.inc"
#define __API_COLOUR rgb32
#define __API_INT_TYPE uint32_t
#define __API_INT_TYPE_BIT_NUM 32
#include "__arm_2d_copy_helium.inc"
#define __ARM_2D_COMPILATION_UNIT
#include "__arm_2d_transform_helium.c"
#define __ARM_2D_COMPILATION_UNIT
#include "__arm_2d_conversion_helium.c"
/*----------------------------------------------------------------------------*
* Helper
*----------------------------------------------------------------------------*/
__OVERRIDE_WEAK
void __MVE_WRAPPER(arm_2d_helper_swap_rgb16)(uint16_t *phwBuffer, uint32_t wCount)
{
if (0 == wCount) {
return ;
}
// aligned (2)
assert((((uintptr_t) phwBuffer) & 0x01) == 0);
// src not aligned to 32-bit
// (helium supports unaligned vector load & store but with extra cycle penalty)
if ((((uintptr_t) phwBuffer) & 0x03) == 0x02) {
// handle the leading pixel
uint32_t wTemp = *phwBuffer;
*phwBuffer++ = (uint16_t)__REV16(wTemp);
wCount--;
}
#ifdef USE_MVE_INTRINSICS
do {
mve_pred16_t tailPred = vctp16q(wCount);
uint16x8_t rgb16vec = vld1q_z(phwBuffer, tailPred);
rgb16vec = (uint16x8_t)vrev16q_m_u8(rgb16vec, rgb16vec, tailPred);
vst1q_p(phwBuffer , rgb16vec , tailPred);
phwBuffer += 8;
wCount -= 8;
}
while ((int32_t)wCount > 0);
#else
__asm volatile(
".p2align 2 \n"
" wlstp.16 lr, %[wCount], 1f \n"
"2: \n"
" vldrh.u16 q0, [%[phwBuffer]] \n"
" vrev16.8 q0, q0 \n"
" vstrh.u16 q0, [%[phwBuffer]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [phwBuffer] "+r"(phwBuffer)
: [wCount] "r" (wCount)
:"q0", "lr", "memory");
#endif
}
/*----------------------------------------------------------------------------*
* Specialized Copy Routines *
*----------------------------------------------------------------------------*/
static
void __arm_copy_16_mve_narrow( uint16_t *phwSource,
int16_t iSourceStride,
uint16_t *phwTarget,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize)
{
#ifdef USE_MVE_INTRINSICS
for (int32_t x = 0; x < ptCopySize->iWidth; x++) {
uint16x8_t srcStr = vidupq_u16((uint32_t) 0, 1);
uint16x8_t dstStr = vidupq_u16((uint32_t) 0, 1);
srcStr = srcStr * iSourceStride;
dstStr = dstStr * iTargetStride;
for (int32_t y = 0; y < ptCopySize->iHeight / 8; y++) {
uint16x8_t in = vldrhq_gather_shifted_offset_u16(phwSource, srcStr);
srcStr = vaddq_n_u16(srcStr, (8 * iSourceStride));
vstrhq_scatter_shifted_offset_u16(phwTarget, dstStr, in);
dstStr = vaddq_n_u16(dstStr, (8 * iTargetStride));
}
phwSource++;
phwTarget++;
}
#else
__asm volatile(
" clrm {r2, r4} \n"
" vidup.u16 q0, r2, #1 \n"
" vmul.i16 q2, q0, %[iSourceStride] \n"
" vidup.u16 q1, r4, #1 \n"
" vmul.i16 q3, q1, %[iTargetStride] \n"
"3: \n"
/* outer loop, iterates over columns */
/* size = ptCopySize->iWidth */
" vmov q0, q2 \n"
" vmov q1, q3 \n"
/* inner loop, iterates over rows (size = ptCopySize->iHeight) */
" wlstp.16 lr, %[iHeight], 1f \n"
".p2align 2 \n"
"2: \n"
" vldrh.u16 q4, [%[phwSource], q0, uxtw #1] \n"
" vadd.i16 q0, q0, %[iSourceStridex8] \n"
" vstrh.16 q4, [%[phwTarget], q1, uxtw #1] \n"
" vadd.i16 q1, q1, %[iTargetStridex8] \n"
" letp lr, 2b \n"
"1: \n"
" add.n %[phwSource], #2 \n"
" add.n %[phwTarget], #2 \n"
" subs %[iWidth], #1 \n"
" bne 3b \n"
: [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth),
[iSourceStride] "r" (iSourceStride),[iSourceStridex8] "r" (iSourceStride*8),
[iTargetStride] "r" (iTargetStride),[iTargetStridex8] "r" (iTargetStride*8)
: "r2", "r4", "q0", "q1", "q2", "q3", "q4", "memory", "r14", "cc");
#endif
}
static
void __arm_copy_32_mve_narrow( uint32_t *pwSource,
int32_t iSourceStride,
uint32_t *pwTarget,
int32_t iTargetStride,
arm_2d_size_t *ptCopySize)
{
#ifdef USE_MVE_INTRINSICS
for (int_fast32_t x = 0; x < ptCopySize->iWidth; x++) {
uint32x4_t srcStr = vidupq_u32((uint32_t) 0, 1);
uint32x4_t dstStr = vidupq_u32((uint32_t) 0, 1);
srcStr = srcStr * iSourceStride;
dstStr = dstStr * iTargetStride;
for (int_fast32_t y = 0; y < ptCopySize->iHeight / 4; y++) {
uint32x4_t in = vldrwq_gather_shifted_offset_u32(pwSource, srcStr);
srcStr = vaddq_n_u32(srcStr, (4 * iSourceStride));
vstrwq_scatter_shifted_offset_u32(pwTarget, dstStr, in);
dstStr = vaddq_n_u32(dstStr, (4 * iTargetStride));
}
pwSource++;
pwTarget++;
}
#else
__asm volatile(
" clrm {r2, r4} \n"
" vidup.u32 q0, r2, #1 \n"
" vmul.i32 q2, q0, %[iSourceStride] \n"
" vidup.u32 q1, r4, #1 \n"
" vmul.i32 q3, q1, %[iTargetStride] \n"
"3: \n"
/* outer loop, iterates over columns */
/* size = ptCopySize->iWidth */
" vmov q0, q2 \n"
" vmov q1, q3 \n"
/* inner loop, iterates over rows (size = ptCopySize->iHeight) */
" wlstp.32 lr, %[iHeight], 1f \n"
".p2align 2 \n"
"2: \n"
" vldrw.u32 q4, [%[pwSource], q0, uxtw #2] \n"
" vadd.i32 q0, q0, %[iSourceStridex4] \n"
" vstrw.32 q4, [%[pwTarget], q1, uxtw #2] \n"
" vadd.i32 q1, q1, %[iTargetStridex4] \n"
" letp lr, 2b \n"
"1: \n"
" add.n %[pwSource], #4 \n"
" add.n %[pwTarget], #4 \n"
" subs %[iWidth], #1 \n"
" bne 3b \n"
: [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth),
[iSourceStride] "r" (iSourceStride),[iSourceStridex4] "r" (iSourceStride*4),
[iTargetStride] "r" (iTargetStride),[iTargetStridex4] "r" (iTargetStride*4)
: "r2", "r4", "q0", "q1", "q2", "q3", "q4", "memory", "r14", "cc");
#endif
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb16_copy)( uint16_t *phwSource,
int16_t iSourceStride,
uint16_t *phwTarget,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize)
{
/*
* 16-bit Narrow copy case:
* use column copy with scatter / gather
*/
if(ptCopySize->iWidth <= 4) {
__arm_copy_16_mve_narrow(phwSource,
iSourceStride,
phwTarget,
iTargetStride,
ptCopySize);
} else if((((uint32_t)phwSource & 3) == 0) && (((uint32_t)phwTarget & 3) == 0)
&& ((iSourceStride & 3) == 0) && ((iTargetStride & 3) ==0)) {
/*
* source / dst & strides are 64-bit aligned
* use scalar LDRD/STRD, faster than back to back vector VLDR/VSTR on M55
*/
__asm volatile(
"3: \n"
" mov r0, %[phwSource] \n"
" mov r1, %[phwTarget] \n"
/* scalar version faster (no DTCM bank conflict)*/
" wls lr, %[iWidth], 1f \n"
".p2align 2 \n"
"2: \n"
" ldrd r2, r3, [r0], #8 \n"
" strd r2, r3, [r1], #8 \n"
" le lr, 2b \n"
"1: \n"
// tail
" wls lr, %[iWidthTail], 1f \n"
".p2align 2 \n"
"2: \n"
" ldrh r2, [r0], #2 \n"
" strh r2, [r1], #2 \n"
" le lr, 2b \n"
"1: \n"
" add %[phwSource], %[iSourceStride] \n"
" add %[phwTarget], %[iTargetStride] \n"
" subs %[iHeight], #1 \n"
" bne 3b \n"
: [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth/4),
[iWidthTail] "r" (ptCopySize->iWidth & 3),
[iSourceStride] "r" (iSourceStride*sizeof(uint16_t)),
[iTargetStride] "r" (iTargetStride*sizeof(uint16_t))
: "r0", "r1", "r2", "r3", "q0", "memory", "r14", "cc"
);
}
else
{
/*
* generic column major 16-bit 2D copy
*/
int32_t iWidth = ptCopySize->iWidth;
int32_t iHeight = ptCopySize->iHeight;
__asm volatile(
" mov r2, %[iHeight] \n"
"3: \n"
" mov r0, %[phwSource] \n"
" mov r1, %[phwTarget] \n"
" wlstp.16 lr, %[iWidth], 1f \n"
".p2align 2 \n"
"2: \n"
" vldrh.u16 q0, [r0], #16 \n"
" vstrh.16 q0, [r1], #16 \n"
" letp lr, 2b \n"
"1: \n"
" add %[phwSource], %[iSourceStride] \n"
" add %[phwTarget], %[iTargetStride] \n"
" subs r2, #1 \n"
" bne 3b \n"
: [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
: [iHeight] "r"(iHeight), [iWidth] "r" (iWidth),
[iSourceStride] "r" (iSourceStride*sizeof(uint16_t)),
[iTargetStride] "r" (iTargetStride*sizeof(uint16_t))
: "r0", "r1", "r2", "q0", "memory", "r14", "cc");
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb32_copy)( uint32_t *pwSource,
int16_t iSourceStride,
uint32_t *pwTarget,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize)
{
if(ptCopySize->iWidth <= 2) {
/*
* 32-bit Narrow copy case:
* use column copy with scatter / gather
*/
__arm_copy_32_mve_narrow(pwSource,
iSourceStride,
pwTarget,
iTargetStride,
ptCopySize);
} else if((((uint32_t)pwSource & 3) == 0) && (((uint32_t)pwTarget & 3) == 0)
&& ((iSourceStride & 3) == 0) && ((iTargetStride & 3) ==0)) {
/*
* source / dst & strides are 64-bit aligned
* use scalar LDRD/STRD, faster than back to back vector VLDR/VSTR on M55
*/
__asm volatile(
"3: \n"
" mov r0, %[pwSource] \n"
" mov r1, %[pwTarget] \n"
/* scalar version faster (no DTCM bank conflict)*/
" wls lr, %[iWidth], 1f \n"
".p2align 2 \n"
"2: \n"
" ldrd r2, r3, [r0], #8 \n"
" strd r2, r3, [r1], #8 \n"
" le lr, 2b \n"
"1: \n"
// tail
" wls lr, %[iWidthTail], 1f \n"
".p2align 2 \n"
"2: \n"
" ldr r2, [r0], #4 \n"
" str r2, [r1], #4 \n"
" le lr, 2b \n"
"1: \n"
" add %[pwSource], %[iSourceStride] \n"
" add %[pwTarget], %[iTargetStride] \n"
" subs %[iHeight], #1 \n"
" bne 3b \n"
: [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth/2),
[iWidthTail] "r" (ptCopySize->iWidth & 1),
[iSourceStride] "r" (iSourceStride*sizeof(uint32_t)),
[iTargetStride] "r" (iTargetStride*sizeof(uint32_t))
: "r0", "r1", "r2", "r3", "q0", "memory", "r14", "cc"
);
}
else
{
/*
* generic column major 32-bit 2D copy
*/
__asm volatile(
"3: \n"
" mov r0, %[pwSource] \n"
" mov r1, %[pwTarget] \n"
" wlstp.32 lr, %[iWidth], 1f \n"
".p2align 2 \n"
"2: \n"
" vldrw.32 q0, [r0], #16 \n"
" vstrw.32 q0, [r1], #16 \n"
" letp lr, 2b \n"
"1: \n"
" add %[pwSource], %[iSourceStride] \n"
" add %[pwTarget], %[iTargetStride] \n"
" subs %[iHeight], #1 \n"
" bne 3b \n"
: [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth),
[iSourceStride] "r" (iSourceStride*sizeof(uint32_t)),
[iTargetStride] "r" (iTargetStride*sizeof(uint32_t))
: "r0", "r1", "q0", "memory", "r14", "cc");
}
}
/*----------------------------------------------------------------------------*
* alpha blending *
*----------------------------------------------------------------------------*/
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_gray8_alpha_blending)(uint8_t * __RESTRICT pSourceBase,
int16_t iSourceStride,
uint8_t * __RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint_fast16_t hwRatio)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
uint16_t hwRatioCompl = 256 - hwRatio;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pSource = pSourceBase;
uint8_t *pTarget = pTargetBase;
int32_t blkCnt = iWidth;
#ifdef USE_MVE_INTRINSICS
do {
mve_pred16_t tailPred = vctp16q(blkCnt);
uint16x8_t vecTgt = vldrbq_z_u16(pTarget, tailPred);
uint16x8_t vecSrc = vldrbq_z_u16(pSource, tailPred);
vecTgt = vmulq_x(vecTgt, hwRatioCompl, tailPred);
vecTgt = vmlaq_m(vecTgt, vecSrc, hwRatio, tailPred);
vecTgt = vecTgt >> 8;
vstrbq_p_u16(pTarget , vecTgt , tailPred);
pSource += 8;
pTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
__asm volatile(
" vldrb.u16 q0, [%[pTarget]] \n"
".p2align 2 \n"
" wls lr, %[loopCnt], 1f \n"
"2: \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.s16 q0, q1, %[hwRatio] \n"
" vldrb.u16 q2, [%[pTarget], #8] \n"
" vshr.u16 q0, q0, #8 \n"
" vstrb.u16 q0, [%[pTarget]], #8 \n"
" vmul.u16 q2, q2, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.s16 q2, q1, %[hwRatio] \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vstrb.u16 q2, [%[pTarget]], #8 \n"
" le lr, 2b \n"
"1: \n"
/* tail */
" wlstp.16 lr, %[tail], 1f \n"
"2: \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.s16 q0, q1, %[hwRatio] \n"
" vshr.u16 q1, q0, #8 \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vstrb.u16 q1, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget), [pSource] "+r" (pSource)
: [hwRatio] "r" (hwRatio), [hwRatioCompl] "r" (hwRatioCompl),
[loopCnt] "r"(blkCnt/16), [tail] "r"(blkCnt & 0xf)
:"q0", "q1", "q2", "memory", "r14");
#endif
pSourceBase += (iSourceStride);
pTargetBase += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_gray8_alpha_blending_colour_keying)(uint8_t * __RESTRICT pSourceBase,
int16_t iSourceStride,
uint8_t * __RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint_fast16_t hwRatio,
uint8_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
uint16_t hwRatioCompl = 256 - hwRatio;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pSource = pSourceBase;
uint8_t *pTarget = pTargetBase;
int32_t blkCnt = iWidth;
#ifdef USE_MVE_INTRINSICS
do {
mve_pred16_t tailPred = vctp16q(blkCnt);
uint16x8_t vecTgt = vldrbq_z_u16(pTarget, tailPred);
uint16x8_t vecSrc = vldrbq_z_u16(pSource, tailPred);
vecTgt = vmulq_x(vecTgt, hwRatioCompl, tailPred);
vecTgt = vmlaq_m(vecTgt, vecSrc, hwRatio, tailPred);
vecTgt = vecTgt >> 8;
vstrbq_p_u16(pTarget , vecTgt ,
vcmpneq_m_n_u16(vecSrc, (uint16_t)Colour, tailPred));
pSource += 8;
pTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
__asm volatile(
" vldrb.u16 q0, [%[pTarget]] \n"
".p2align 2 \n"
" wls lr, %[loopCnt], 1f \n"
"2: \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.s16 q0, q1, %[hwRatio] \n"
" vldrb.u16 q2, [%[pTarget], #8] \n"
" vshr.u16 q0, q0, #8 \n"
" vpt.u16 ne, q1, %[Colour] \n"
" vstrbt.u16 q0, [%[pTarget]], #8 \n"
" vmul.u16 q2, q2, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.s16 q2, q1, %[hwRatio] \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vpt.u16 ne, q1, %[Colour] \n"
" vstrbt.u16 q2, [%[pTarget]], #8 \n"
" le lr, 2b \n"
"1: \n"
/* tail */
" wlstp.16 lr, %[tail], 1f \n"
"2: \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.s16 q0, q1, %[hwRatio] \n"
" vshr.u16 q2, q0, #8 \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vpt.u16 ne, q1, %[Colour] \n"
" vstrbt.u16 q2, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget), [pSource] "+r" (pSource)
: [hwRatio] "r" (hwRatio), [hwRatioCompl] "r" (hwRatioCompl),
[loopCnt] "r"(blkCnt/16), [tail] "r"(blkCnt & 0xf),
[Colour] "r" (Colour)
:"q0", "q1", "q2", "memory", "r14");
#endif
pSourceBase += (iSourceStride);
pTargetBase += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_gray8_colour_filling_with_opacity)(uint8_t * __restrict pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *
__restrict ptCopySize,
uint8_t Colour,
uint_fast16_t hwRatio)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
uint16_t hwRatioCompl = 256 - hwRatio;
uint16x8_t vecSrc = vdupq_n_u16(Colour);
for (int_fast16_t y = 0; y < iHeight; y++) {
uint8_t *pTarget = pTargetBase;
int32_t blkCnt = iWidth;
#ifdef USE_MVE_INTRINSICS
do {
mve_pred16_t tailPred = vctp16q(blkCnt);
uint16x8_t vecTgt = vldrbq_z_u16(pTarget, tailPred);
vecTgt = vmulq_x(vecTgt, hwRatioCompl, tailPred);
vecTgt = vmlaq_m(vecTgt, vecSrc, hwRatio, tailPred);
vecTgt = vecTgt >> 8;
vstrbq_p_u16(pTarget , vecTgt , tailPred);
pTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
__asm volatile(
" vldrb.u16 q0, [%[pTarget]] \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
".p2align 2 \n"
" wls lr, %[loopCnt], 1f \n"
"2: \n"
" vmla.s16 q0, %[vecSrc], %[hwRatio] \n"
" vldrb.u16 q2, [%[pTarget], #8] \n"
" vshr.u16 q0, q0, #8 \n"
" vmul.u16 q2, q2, %[hwRatioCompl] \n"
" vstrb.u16 q0, [%[pTarget]], #8 \n"
" vmla.s16 q2, %[vecSrc], %[hwRatio] \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vstrb.u16 q2, [%[pTarget]], #8 \n"
" le lr, 2b \n"
"1: \n"
/* tail */
" wlstp.16 lr, %[tail], 1f \n"
"2: \n"
" vmla.s16 q0, %[vecSrc], %[hwRatio] \n"
" vshr.u16 q2, q0, #8 \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vstrb.u16 q2, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget)
: [hwRatio] "r" (hwRatio), [hwRatioCompl] "r" (hwRatioCompl),
[loopCnt] "r"(blkCnt/16), [tail] "r"(blkCnt & 0xf),
[vecSrc] "t" (vecSrc)
:"q0", "q2", "memory", "r14");
#endif
pTargetBase += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb565_alpha_blending)( uint16_t *phwSourceBase,
int16_t iSourceStride,
uint16_t *phwTargetBase,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize,
uint_fast16_t hwRatio)
{
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
uint16_t ratio1x8 = hwRatio * 8;
uint16_t ratio1x4 = hwRatio * 4;
uint16_t ratio2x8 = (256 - hwRatio) * 8;
uint16_t ratio2x4 = (256 - hwRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
const uint16_t *phwSource = phwSourceBase;
uint16_t *phwTarget = phwTargetBase;
blkCnt = ptCopySize->iWidth;
do {
uint16x8_t vecIn;
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack 1st stream */
vecIn = vld1q(phwSource);
vecR0 = vecIn & vecMaskR;
vecB0 = vecIn >> 11;
vecG0 = vecIn >> 5;
vecG0 = vecG0 & vecMaskG;
/* unpack 2nd stream */
vecIn = vld1q(phwTarget);
vecR1 = vecIn & vecMaskR;
vecB1 = vecIn >> 11;
vecG1 = vecIn >> 5;
vecG1 = vecG1 & vecMaskG;
/* merge */
vecR0 = vecR0 * ratio1x8 + vecR1 * ratio2x8;
vecR0 = vecR0 >> 8;
vecG0 = vecG0 * ratio1x4 + vecG1 * ratio2x4;
vecG0 = vecG0 >> 8;
vecB0 = vecB0 * ratio1x8 + vecB1 * ratio2x8;
vecB0 = vecB0 >> 8;
/* pack */
uint16x8_t vOut = vecR0 >> 3 | vmulq((vecG0 & vecMaskGpck), 8)
| vmulq((vecB0 & vecMaskBpck), 256);
vst1q(phwTarget, vOut);
phwSource += 8;
phwTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
phwSourceBase += iSourceStride;
phwTargetBase += iTargetStride;
}
#else /* USE_MVE_INTRINSICS */
uint16_t ratio1x8 = hwRatio * 8;
uint16_t ratio1x4 = hwRatio * 4;
uint16_t ratio2x8 = (256 - hwRatio) * 8;
uint16_t ratio2x4 = (256 - hwRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint32_t iWidth = ptCopySize->iWidth;
int32_t row = ptCopySize->iHeight;
uint16x8_t scratch[1];
vst1q((uint16_t *)scratch, vdupq_n_u16(0x00fc));
do {
const uint16_t *pSource = phwSourceBase;
uint16_t *pTarget = phwTargetBase;
register unsigned loopCnt __asm("lr");
loopCnt = iWidth;
__asm volatile(
".p2align 2 \n"
" vldrh.u16 q4, [%[pTarget]] \n"
" vldrh.u16 q5, [%[pSource]], #16 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
// B target extraction
// right shift by 5 (x 1/32) for M55 friendly
// IV / Mul pipe interleaving
" vqdmulh.s16 q2, q4, %[rshft5] \n"
" vand q7, q4, %[vecMaskR] \n"
" vmul.i16 q6, q7, %[ratio2x8] \n"
// B source extraction
" vand q7, q5, %[vecMaskR] \n"
// B mix
" vmla.s16 q6, q7, %[ratio1x8] \n"
// G extraction
" vand q2, q2, %[vecMaskG] \n"
" vshr.u16 q7, q5, #5 \n"
" vmul.i16 q2, q2, %[ratio2x4] \n"
// G extraction
" vand q7, q7, %[vecMaskG] \n"
// G mix
" vmla.s16 q2, q7, %[ratio1x4] \n"
// R extraction
" vshr.u16 q4, q4, #11 \n"
" vmul.i16 q7, q4, %[ratio2x8] \n"
// R extraction
" vshr.u16 q5, q5, #11 \n"
// R mix
" vmla.s16 q7, q5, %[ratio1x8] \n"
" vshr.u16 q2, q2, #8 \n"
" vldrh.16 q5, [%[scratch]] \n"
" vand q2, q2, q5 \n"
// vmulq((vecG0 & 0x00fc), 8)
" vmul.i16 q2, q2, %[eight] \n"
" vshr.u16 q4, q7, #8 \n"
// schedule next source load
" vldrh.u16 q5, [%[pSource]], #16 \n"
" vand q7, q4, %[vecMaskBpck] \n"
// pack R & G
// vmulq((vecG0 & vecMaskGpck), 8) + vmulq((vecR0 & vecMaskRpck), 256)
" vmla.s16 q2, q7, %[twofiftysix] \n"
// downshift B ((vecB0 >> 8) >> 3)
" vshr.u16 q7, q6, #11 \n"
// schedule next target load (pre offset as target not imcrementred so far)
" vldrh.u16 q4, [%[pTarget], #16] \n"
// pack blue with R&G
" vorr q2, q2, q7 \n"
" vstrh.16 q2, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [pSource] "+r"(pSource), [pTarget] "+r" (pTarget), [loopCnt] "+r"(loopCnt)
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
[vecMaskBpck] "t" (vecMaskBpck),
[ratio1x8] "r" (ratio1x8), [ratio2x8] "r" (ratio2x8),
[ratio1x4] "r" (ratio1x4), [ratio2x4] "r" (ratio2x4),
[eight] "r" (8), [twofiftysix] "r" (256), [rshft5] "r" (1024), [scratch] "r" (scratch)
: "q2", "q4", "q5", "q6", "q7", "memory" );
phwSourceBase += iSourceStride;
phwTargetBase += iTargetStride;
} while (--row);
#endif /* USE_MVE_INTRINSICS */
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb565_colour_filling_with_opacity)(
uint16_t *__RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *__RESTRICT ptCopySize,
uint16_t Colour,
uint_fast16_t hwRatio)
{
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
uint16_t ratio1x8 = hwRatio * 8;
uint16_t ratio1x4 = hwRatio * 4;
uint16_t ratio2x8 = (256 - hwRatio) * 8;
uint16_t ratio2x4 = (256 - hwRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
uint16x8_t vecIn;
uint16x8_t vecColorR, vecColorB, vecColorG;
/* unpack color & scale */
vecIn = vdupq_n_u16(Colour);
vecColorR = (vecIn & vecMaskR) * ratio1x8;
vecColorB = (vecIn >> 11) * ratio1x8;
vecColorG = vecIn >> 5;
vecColorG = (vecColorG & vecMaskG) * ratio1x4;
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
uint16_t *phwTarget = pTargetBase;
blkCnt = ptCopySize->iWidth;
do {
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack stream */
vecIn = vld1q(phwTarget);
vecR1 = vecIn & vecMaskR;
vecB1 = vecIn >> 11;
vecG1 = vecIn >> 5;
vecG1 = vecG1 & vecMaskG;
/* merge */
vecR0 = vecColorR + vecR1 * ratio2x8;
vecR0 = vecR0 >> 8;
vecG0 = vecColorG + vecG1 * ratio2x4;
vecG0 = vecG0 >> 8;
vecB0 = vecColorB + vecB1 * ratio2x8;
vecB0 = vecB0 >> 8;
/* pack */
uint16x8_t vOut = vecR0 >> 3 | vmulq((vecG0 & vecMaskGpck), 8)
| vmulq((vecB0 & vecMaskBpck), 256);
vst1q(phwTarget, vOut);
phwTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
pTargetBase += iTargetStride;
}
#else /* USE_MVE_INTRINSICS */
uint16_t ratio1x8 = hwRatio * 8;
uint16_t ratio1x4 = hwRatio * 4;
uint16_t ratio2x8 = (256 - hwRatio) * 8;
uint16_t ratio2x4 = (256 - hwRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecColorR, vecColorB, vecColorG;
uint16x8_t scratch[4];
/* unpack color */
uint16x8_t vecIn = vdupq_n_u16(Colour);
vecColorR = vecIn & vecMaskR;
vecColorB = vecIn >> 11;
vecColorG = vecIn >> 5;
vecColorG = vecColorG & vecMaskG;
vst1q((uint16_t*)scratch, vecColorR * ratio1x8);
vst1q((uint16_t*)&scratch[1], vecColorB * ratio1x8);
vst1q((uint16_t*)&scratch[2], vecColorG * ratio1x4);
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0x00fc));
int32_t row = ptCopySize->iHeight;
do {
uint16_t *phwTarget = pTargetBase;
register unsigned loopCnt __asm("lr");
loopCnt = ptCopySize->iWidth;
__asm volatile(
" vldrh.u16 q4, [%[phwTarget]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
".p2align 2 \n"
"2: \n"
// B target extraction
" vand q7, q4, %[vecMaskR] \n"
" vldrh.u16 q6, [%[scratch]] \n"
" vshr.u16 q2, q4, #5 \n"
// B mix
" vmla.s16 q6, q7, %[ratio2x8] \n"
// G extraction
" vand q7, q2, %[vecMaskG] \n"
// G extraction
" vldrh.u16 q2, [%[scratch], #32] \n"
// G mix
" vmla.s16 q2, q7, %[ratio2x4] \n"
" vshr.u16 q4, q4, #11 \n"
// R extraction
" vldrh.u16 q7, [%[scratch], #16] \n"
" vshr.u16 q2, q2, #8 \n"
// R mix
" vmla.s16 q7, q4, %[ratio2x8] \n"
" vshr.u16 q4, q7, #8 \n"
// load duplicated 0xfc mask
" vldrh.u16 q7, [%[scratch], #48] \n"
" vand q2, q2, q7 \n"
" vmul.i16 q2, q2, %[eight] \n"
" vand q7, q4, %[vecMaskBpck] \n"
// pack R & G
" vmla.s16 q2, q7, %[twofiftysix] \n"
// downshift B ((vecB0 >> 8) >> 3)
" vshr.u16 q7, q6, #11 \n"
// schedule next target load
" vldrh.u16 q4, [%[phwTarget], #16] \n"
// pack blue with R&G
" vorr q2, q2, q7 \n"
" vstrh.16 q2, [%[phwTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [phwTarget] "+r" (phwTarget), [loopCnt] "+r"(loopCnt)
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
[vecMaskBpck] "t" (vecMaskBpck),
[ratio2x8] "r" (ratio2x8), [ratio2x4] "r" (ratio2x4),
[eight] "r" (8), [twofiftysix] "r" (256), [scratch] "r" (scratch)
: "q2", "q4", "q5", "q6", "q7", "memory" );
pTargetBase += iTargetStride;
} while (--row);
#endif /* USE_MVE_INTRINSICS */
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb565_alpha_blending_colour_keying)(
uint16_t * __RESTRICT phwSource,
int16_t iSourceStride,
uint16_t * __RESTRICT phwTarget,
int16_t iTargetStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint_fast16_t hwRatio,
uint16_t hwColour)
{
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
#ifdef USE_MVE_INTRINSICS
uint32_t iHeight = ptCopySize->iHeight;
uint32_t iWidth = ptCopySize->iWidth;
int32_t blkCnt;
uint16_t ratio1x8 = hwRatio * 8;
uint16_t ratio1x4 = hwRatio * 4;
uint16_t ratio2x8 = (256 - hwRatio) * 8;
uint16_t ratio2x4 = (256 - hwRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
for (uint32_t y = 0; y < iHeight; y++) {
// - inconditional blending + predicated dst update
const uint16_t *pSource = phwSource;
uint16_t *pTarget = phwTarget;
blkCnt = iWidth >> 3;
while (blkCnt > 0) {
uint16x8_t vecInSrc, vecInDst;
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack 1st stream */
vecInSrc = vld1q(pSource);
vecR0 = vandq(vecInSrc, vecMaskR);
vecB0 = vshrq(vecInSrc, 11);
vecG0 = vshrq(vecInSrc, 5);
vecG0 = vandq(vecG0, vecMaskG);
/* unpack 2nd stream */
vecInDst = vld1q(pTarget);
vecR1 = vandq(vecInDst, vecMaskR);
vecB1 = vshrq(vecInDst, 11);
vecG1 = vshrq(vecInDst, 5);
vecG1 = vandq(vecG1, vecMaskG);
/* merge */
vecR0 = vmlaq(vmulq(vecR0, ratio1x8), vecR1, ratio2x8);
vecR0 = vshrq(vecR0, 8);
vecG0 = vmlaq(vmulq(vecG0, ratio1x4), vecG1, ratio2x4);
vecG0 = vshrq(vecG0, 8);
vecB0 = vmlaq(vmulq(vecB0, ratio1x8), vecB1, ratio2x8);
vecB0 = vshrq(vecB0, 8);
/* pack */
uint16x8_t vOut = vorrq(vshrq(vecR0, 3),
vmulq(vandq(vecG0, vecMaskGpck), 8));
vOut = vorrq(vOut, vmulq(vandq(vecB0, vecMaskBpck), 256));
vst1q_p(pTarget, vOut, vcmpneq_n_s16(vecInSrc, hwColour));
pSource += 8;
pTarget += 8;
blkCnt--;
}
blkCnt = iWidth & 7;
if (blkCnt > 0U) {
uint16x8_t vecInSrc, vecInDst;
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack 1st stream */
vecInSrc = vld1q(pSource);
vecR0 = vandq(vecInSrc, vecMaskR);
vecB0 = vshrq(vecInSrc, 11);
vecG0 = vshrq(vecInSrc, 5);
vecG0 = vandq(vecG0, vecMaskG);
/* unpack 2nd stream */
vecInDst = vld1q(pTarget);
vecR1 = vandq(vecInDst, vecMaskR);
vecB1 = vshrq(vecInDst, 11);
vecG1 = vshrq(vecInDst, 5);
vecG1 = vandq(vecG1, vecMaskG);
/* merge */
vecR0 = vmlaq(vmulq(vecR0, ratio1x8), vecR1, ratio2x8);
vecR0 = vshrq(vecR0, 8);
vecG0 = vmlaq(vmulq(vecG0, ratio1x4), vecG1, ratio2x4);
vecG0 = vshrq(vecG0, 8);
vecB0 = vmlaq(vmulq(vecB0, ratio1x8), vecB1, ratio2x8);
vecB0 = vshrq(vecB0, 8);
/* pack */
uint16x8_t vOut = vorrq(vshrq(vecR0, 3),
vmulq(vandq(vecG0, vecMaskGpck), 8));
vOut = vorrq(vOut,
vmulq(vandq(vecB0, vecMaskBpck), 256));
vst1q_p(pTarget, vOut,
vcmpneq_m_n_s16(vecInSrc, hwColour, vctp16q(blkCnt)));
}
phwSource += iSourceStride;
phwTarget += iTargetStride;
}
#else
uint32_t iHeight = ptCopySize->iHeight;
uint32_t iWidth = ptCopySize->iWidth;
uint16_t ratio1x8 = hwRatio * 8;
uint16_t ratio1x4 = hwRatio * 4;
uint16_t ratio2x8 = (256 - hwRatio) * 8;
uint16_t ratio2x4 = (256 - hwRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t scratch[1];
vst1q((uint16_t *)scratch, vdupq_n_u16(0x00fc));
for (uint32_t y = 0; y < iHeight; y++) {
const uint16_t *pSource = phwSource;
uint16_t *pTarget = phwTarget;
register unsigned loopCnt __asm("lr");
loopCnt = iWidth;
__asm volatile(
".p2align 2 \n"
" vldrh.u16 q4, [%[pTarget]] \n"
" vldrh.u16 q5, [%[pSource]], #16 \n"
" vand q7, q4, %[vecMaskR] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
// B target extraction
" vshr.u16 q2, q4, #5 \n"
" vmul.i16 q6, q7, %[ratio2x8] \n"
// B source extraction
" vand q7, q5, %[vecMaskR] \n"
// B mix
" vmla.s16 q6, q7, %[ratio1x8] \n"
// G extraction
" vand q2, q2, %[vecMaskG] \n"
" vshr.u16 q7, q5, #5 \n"
" vmul.i16 q2, q2, %[ratio2x4] \n"
// G extraction
" vand q7, q7, %[vecMaskG] \n"
// G mix
" vmla.s16 q2, q7, %[ratio1x4] \n"
// R extraction
" vshr.u16 q4, q4, #11 \n"
" vmul.i16 q7, q4, %[ratio2x8] \n"
// R extraction
" vshr.u16 q5, q5, #11 \n"
// R mix
" vmla.s16 q7, q5, %[ratio1x8] \n"
" vshr.u16 q2, q2, #8 \n"
" vldrh.16 q5, [%[scratch]] \n"
" vand q2, q2, q5 \n"
// vmulq((vecG0 & 0x00fc), 8)
" vmul.i16 q2, q2, %[eight] \n"
" vshr.u16 q4, q7, #8 \n"
// schedule next source load
" vldrh.u16 q5, [%[pSource]], #16 \n"
" vand q7, q4, %[vecMaskBpck] \n"
// pack R & G
// vmulq((vecG0 & vecMaskGpck), 8) + vmulq((vecR0 & vecMaskRpck), 256)
" vmla.s16 q2, q7, %[twofiftysix] \n"
// downshift B ((vecB0 >> 8) >> 3)
" vshr.u16 q7, q6, #11 \n"
// schedule next target load (pre offset as target not imcrementred so far)
" vldrh.u16 q4, [%[pTarget], #16] \n"
// pack blue with R&G
" vorr q2, q2, q7 \n"
" vldrh.u16 q6, [%[pSource], #-32] \n"
" vand q7, q4, %[vecMaskR] \n"
" vpt.u16 ne, q6, %[hwColour] \n"
" vstrht.16 q2, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [pSource] "+r"(pSource), [pTarget] "+r" (pTarget), [loopCnt] "+r"(loopCnt)
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
[vecMaskBpck] "t" (vecMaskBpck),
[ratio1x8] "r" (ratio1x8), [ratio2x8] "r" (ratio2x8),
[ratio1x4] "r" (ratio1x4), [ratio2x4] "r" (ratio2x4),
[eight] "r" (8), [twofiftysix] "r" (256), [hwColour] "r" (hwColour), [scratch] "r" (scratch)
: "q2", "q4", "q5", "q6", "q7", "memory" );
phwSource += (iSourceStride);
phwTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_cccn888_alpha_blending)( uint32_t *pwSourceBase,
int16_t iSourceStride,
uint32_t *pwTargetBase,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize,
uint_fast16_t hwRatio)
{
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
uint16_t hwRatioCompl = 256 - (uint16_t) hwRatio;
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
int32_t row = ptCopySize->iHeight;
while (row > 0) {
const uint32_t *pwSource = pwSourceBase;
uint32_t *pwTarget = pwTargetBase;
/* byte extraction into 16-bit vector */
uint16x8_t vecSrc = vldrbq_u16((const uint8_t *)pwSource);
uint16x8_t vecTrg = vldrbq_u16((const uint8_t *)pwTarget);
pwSource += 2;
blkCnt = ptCopySize->iWidth;
while (blkCnt > 0) {
vstrbq_u16((const uint8_t *)pwTarget,
vmlaq(vmulq(vecSrc, hwRatio), vecTrg, hwRatioCompl) >> 8);
pwTarget += 2;
vecSrc = vldrbq_u16((const uint8_t *)pwSource);
vecTrg = vldrbq_u16((const uint8_t *)pwTarget);
pwSource += 2;
blkCnt -= 2;
}
pwSourceBase += iSourceStride;
pwTargetBase += iTargetStride;
row--;
}
#else
register unsigned blkCnt __asm("lr");
int32_t row = ptCopySize->iHeight;
while(row > 0)
{
blkCnt = ptCopySize->iWidth*4;
const uint32_t *pwSource = pwSourceBase;
uint32_t *pwTarget = pwTargetBase;
__asm volatile(
" vldrb.u16 q0, [%[pwSource]], #8 \n"
" vldrb.u16 q1, [%[pwTarget]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vmul.u16 q2, q0, %[hwRatio] \n"
" vldrb.u16 q0, [%[pwSource]], #8 \n"
" vmla.s16 q2, q1, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pwTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vstrb.16 q2, [%[pwTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pwSource] "+l"(pwSource), [pwTarget] "+l"(pwTarget),
[loopCnt] "+r"(blkCnt)
: [hwRatio] "r" (hwRatio), [hwRatioCompl] "r" (hwRatioCompl)
: "q0", "q1", "q2", "memory" );
pwSourceBase += iSourceStride;
pwTargetBase += iTargetStride;
row--;
}
#endif
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_cccn888_colour_filling_with_opacity)(
uint32_t *__RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *__RESTRICT ptCopySize,
uint32_t Colour,
uint_fast16_t hwRatio)
{
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
uint16_t hwRatioCompl = 256 - (uint16_t) hwRatio;
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
int32_t row = ptCopySize->iHeight;
uint32_t scratch[2];
uint16x8_t vColor;
scratch[0] = scratch[1] = Colour;
vColor = vldrbq_u16((uint8_t *) scratch);
vColor = vColor * (uint16_t)hwRatio;
while (row > 0) {
uint32_t *pTarget = pTargetBase;
blkCnt = ptCopySize->iWidth;
while (blkCnt > 0) {
/* byte extraction into 16-bit vector */
uint16x8_t vecTrg = vldrbq_u16((uint8_t *)pTarget);
vstrbq_u16((uint8_t *)pTarget, vmlaq(vColor, vecTrg, hwRatioCompl) >> 8);
pTarget += 2;
blkCnt -= 2;
}
pTargetBase += iTargetStride;
row--;
}
#else /* USE_MVE_INTRINSICS */
int32_t blkCnt;
int32_t row = ptCopySize->iHeight;
uint32_t scratch[2];
uint16x8_t vColor;
scratch[0] = scratch[1] = Colour;
vColor = vldrbq_u16((uint8_t *) scratch);
vColor = vColor * (uint16_t)hwRatio;
while (row > 0) {
uint32_t *pTarget = pTargetBase;
blkCnt = ptCopySize->iWidth*4;
__asm volatile(
/* preload */
" vldrb.u16 q1, [%[pTarget]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
".p2align 2 \n"
"2: \n"
" vmov q2, %[vColor] \n"
" vmla.s16 q2, q1, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vstrb.16 q2, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+l"(pTarget)
: [loopCnt] "r"(blkCnt), [hwRatioCompl] "r" (hwRatioCompl), [vColor] "t" (vColor)
: "q0", "q1", "q2", "memory" );
pTargetBase += iTargetStride;
row--;
}
#endif /* USE_MVE_INTRINSICS */
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_cccn888_alpha_blending_colour_keying)(uint32_t * __RESTRICT pSourceBase,
int16_t iSourceStride,
uint32_t * __RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint_fast16_t hwRatio,
uint32_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
uint16_t hwRatioCompl = 256 - hwRatio;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint32_t *pSource = pSourceBase;
uint32_t *pTarget = pTargetBase;
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt = iWidth;
do {
mve_pred16_t p = vctp32q(blkCnt);
uint8x16_t vSrc8 = vld1q_z(pSource, p);
uint8x16_t vTrg8 = vld1q_z(pTarget, p);
/* 16-bit expansion A/G src pixels */
uint16x8_t vSrc16b = vmovlbq_x(vSrc8, p);
/* 16-bit expansion R/B src pixels */
uint16x8_t vSrc16t = vmovltq_x(vSrc8, p);
/* 16-bit expansion A/G target pixels */
uint16x8_t vTrg16b = vmovlbq_x(vTrg8, p);
/* 16-bit expansion R/B target pixels */
uint16x8_t vTrg16t = vmovltq_x(vTrg8, p);
/* A/G blending */
int16x8_t vecOutb = vmlaq_m(vmulq_x(vSrc16b, hwRatio, p), vTrg16b, hwRatioCompl, p);
/* R/B blending */
int16x8_t vecOutt = vmlaq_m(vmulq_x(vSrc16t, hwRatio, p), vTrg16t, hwRatioCompl, p);
/* merge into 8-bit vector */
int8x16_t vecOut8 = vuninitializedq_s8();
vecOut8 = vqshrnbq_m_n_s16(vecOut8, vecOutb, 8, p);
vecOut8 = vqshrntq_m_n_s16(vecOut8, vecOutt, 8, p);
// update if (*pSourceBase != Colour)
vst1q_p_u32(pTarget, (uint32x4_t) vecOut8,
vcmpneq_m_n_u32((uint32x4_t) vSrc8, Colour, p));
pSource += 4;
pTarget += 4;
blkCnt -= 4;
}
while (blkCnt > 0);
#else // USE_MVE_INTRINSICS
__asm volatile (
".p2align 2 \n"
/* preload uint32x4_t target vector */
" vldrw.u32 q2, [%[targ]] \n"
" wlstp.32 lr, %[loopCnt], 1f \n"
"2: \n"
/* 16-bit expansion A/G target pixels */
" vmovlb.u8 q3, q2 \n"
" vldrw.u32 q0, [%[src]], #16 \n"
/* 16-bit expansion A/G source pixels */
" vmovlb.u8 q1, q0 \n"
" vmul.i16 q1, q1, %[ratio] \n"
/* 16-bit expansion R/B target pixels */
" vmovlt.u8 q2, q2 \n"
/* A/G blending */
" vmla.s16 q1, q3, %[ratioCmp] \n"
/* 16-bit expansion R/B source pixels */
" vmovlt.u8 q3, q0 \n"
" vmul.i16 q3, q3, %[ratio] \n"
/* merge A/G into 8-bit vector */
" vqshrnb.s16 q1, q1, #8 \n"
/* R/B blending */
" vmla.s16 q3, q2, %[ratioCmp] \n"
/* preload next target */
" vldrw.u32 q2, [%[targ], #16] \n"
/* merge R/B into 8-bit vector */
" vqshrnt.s16 q1, q3, #8 \n"
/* update if (*pSourceBase != Colour) */
" vpt.i32 ne, q0, %[color] \n"
" vstrwt.32 q1, [%[targ]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[targ] "+r" (pTarget), [src] "+r" (pSource)
:[loopCnt] "r" (iWidth), [ratio] "r" (hwRatio),
[ratioCmp] "r" (hwRatioCompl), [color] "r" (Colour)
:"r14", "q0", "q1", "q2", "q3", "memory");
#endif
pSourceBase += (iSourceStride);
pTargetBase += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb565_alpha_blending_direct)(const uint16_t *phwSource,
const uint16_t *phwBackground,
uint16_t *phwDestination,
uint32_t wPixelCount,
uint_fast16_t hwRatio)
{
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
uint16_t ratio1x8 = hwRatio * 8;
uint16_t ratio1x4 = hwRatio * 4;
uint16_t ratio2x8 = (256 - hwRatio) * 8;
uint16_t ratio2x4 = (256 - hwRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
blkCnt = wPixelCount;
do {
uint16x8_t vecIn;
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack 1st stream */
vecIn = vld1q(phwSource);
phwSource += 8;
vecR0 = vecIn & vecMaskR;
vecB0 = vecIn >> 11;
vecG0 = vecIn >> 5;
vecG0 = vecG0 & vecMaskG;
/* unpack 2nd stream */
vecIn = vld1q(phwBackground);
phwBackground += 8;
vecR1 = vecIn & vecMaskR;
vecB1 = vecIn >> 11;
vecG1 = vecIn >> 5;
vecG1 = vecG1 & vecMaskG;
/* merge */
vecR0 = vecR0 * ratio1x8 + vecR1 * ratio2x8;
vecR0 = vecR0 >> 8;
vecG0 = vecG0 * ratio1x4 + vecG1 * ratio2x4;
vecG0 = vecG0 >> 8;
vecB0 = vecB0 * ratio1x8 + vecB1 * ratio2x8;
vecB0 = vecB0 >> 8;
/* pack */
uint16x8_t vOut =
vecR0 >> 3 | vmulq((vecG0 & vecMaskGpck), 8)
| vmulq((vecB0 & vecMaskBpck), 256);
vst1q(phwDestination, vOut);
phwDestination += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else /* USE_MVE_INTRINSICS */
uint16_t ratio1x8 = hwRatio * 8;
uint16_t ratio1x4 = hwRatio * 4;
uint16_t ratio2x8 = (256 - hwRatio) * 8;
uint16_t ratio2x4 = (256 - hwRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
register unsigned loopCnt __asm("lr") = (wPixelCount);
__asm volatile(
" vldrh.u16 q4, [%[in2]], #16 \n"
" vmov.i16 q6, #0x00fc \n"
" vstrw.32 q6, [sp] \n"
" vldrh.u16 q5, [%[in1]], #16 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vand q6, q4, %[vecMaskR] \n"
" vmul.i16 q6, q6, %[ratio2x8] \n"
" vshr.u16 q2, q4, #5 \n"
" vand q7, q5, %[vecMaskR] \n"
" vmla.s16 q6, q7, %[ratio1x8] \n"
" vand q2, q2, %[vecMaskG] \n"
" vshr.u16 q7, q5, #5 \n"
" vmul.i16 q2, q2, %[ratio2x4] \n"
" vand q7, q7, %[vecMaskG] \n"
" vmla.s16 q2, q7, %[ratio1x4] \n"
" vshr.u16 q4, q4, #11 \n"
" vmul.i16 q7, q4, %[ratio2x8] \n"
" vshr.u16 q5, q5, #11 \n"
" vshr.u16 q2, q2, #8 \n"
" vmla.s16 q7, q5, %[ratio1x8] \n"
// " vmov.i16 q6, #0x00fc \n"
" vshr.u16 q7, q7, #8 \n"
// " vmov.i16 q6, #0x00fc \n"
/* load 0x00fc instead of mov for better overlap opportunity */
" vldrw.u32 q4, [sp] \n"
" vand q2, q2, q4 \n"
" vmul.i16 q2, q2, %[eight] \n"
" vand q4, q7, %[vecMaskBpck] \n" // Q7 = vecB0
" vldrh.u16 q5, [%[in1]], #16 \n"
" vmla.s16 q2, q4, %[twofiftysix] \n"
// (vecR0 >> 3) >> 8
" vshr.u16 q6, q6, #11 \n"
" vldrh.u16 q4, [%[in2]], #16 \n"
" vorr q2, q2, q6 \n"
" vstrh.16 q2, [%[out]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [in1] "+r"(phwSource), [in2] "+r"(phwBackground),
[out] "+r" (phwDestination), [loopCnt] "+r"(loopCnt)
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
[vecMaskBpck] "t" (vecMaskBpck),
[ratio1x8] "r" (ratio1x8), [ratio2x8] "r" (ratio2x8),
[ratio1x4] "r" (ratio1x4), [ratio2x4] "r" (ratio2x4),
[eight] "r" (8), [twofiftysix] "r" (256)
: "q2", "q4", "q5", "q6", "q7", "memory" );
#endif
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_cccn888_alpha_blending_direct)(const uint32_t *pwSource,
const uint32_t *pwBackground,
uint32_t *pwDestination,
uint32_t wPixelCount,
uint_fast16_t hwRatio)
{
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
hwRatio += (hwRatio == 255);
#endif
uint16_t hwRatioCompl = 256 - hwRatio;
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
uint16x8_t vecSrc, vecBckg;
vecSrc = vldrbq_u16((uint8_t const *) pwSource);
pwSource += 2;
vecBckg = vldrbq_u16((uint8_t const *) pwBackground);
pwBackground += 2;
blkCnt = wPixelCount;
do {
uint16x8_t vecOut;
vecOut = vmulq_n_u16(vecSrc, (uint16_t) hwRatio);
vecSrc = vldrbq_u16((uint8_t const *) pwSource);
pwSource += 2;
vecOut = vmlaq_n_u16(vecOut, vecBckg, hwRatioCompl);
vecBckg = vldrbq_u16((uint8_t const *) pwBackground);
pwBackground += 2;
vecOut = vecOut >> 8;
vstrbq_u16((uint8_t *) pwDestination, vecOut);
pwDestination += 2;
blkCnt -= 2;
}
while (blkCnt > 0);
#else /* USE_MVE_INTRINSICS */
register unsigned blkCnt __asm("lr") = (wPixelCount * 4);
__asm volatile(
" vldrb.u16 q0, [%[pwSource]], #8 \n"
" vldrb.u16 q1, [%[pwBackg]], #8 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vmul.u16 q2, q0, %[hwRatio] \n"
" vldrb.u16 q0, [%[pwSource]], #8 \n"
" vmla.s16 q2, q1, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pwBackg]], #8 \n"
" vshr.u16 q2, q2, #8 \n"
" vstrb.16 q2, [%[pwDest]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pwSource] "+l"(pwSource), [pwBackg] "+l"(pwBackground),
[pwDest] "+l" (pwDestination), [loopCnt] "+r"(blkCnt)
: [hwRatio] "r" (hwRatio), [hwRatioCompl] "r" (hwRatioCompl)
: "q0", "q1", "q2", "memory" );
#endif
}
/* rgb8_draw_pattern helpers */
/*
* enable to pick gather load offset based on initial offset
* e.g. if iOffset = 3
* will get {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2}
*/
static uint8_t __draw_pattern_src_incr_c8bit[32] = {
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3
};
/*
* enable to pick vector bitmask based on initial offset
* e.g. if iOffset = 3
* will get {8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4}
*/
static uint8_t __draw_pattern_src_bitmask_c8bit[32] = {
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
};
/* rgb16_draw_pattern helpers */
/*
* enable to pick gather load offset based on initial offset
* e.g. if iOffset = 3
* will get {0, 0, 0, 0, 0, 1, 1, 1}
*/
static uint16_t __draw_pattern_src_incr_rgb16[16] = {
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1
};
/*
* enable to pick vector bitmask based on initial offset
* e.g. if iOffset = 3
* will get {8, 16, 32, 64, 128, 1, 2, 4}
*/
static uint16_t __draw_pattern_src_bitmask_rgb16[16] = {
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
};
/* rgb32_draw_pattern helpers */
static uint32_t __draw_pattern_src_incr_rgb32[16] = {
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1
};
static uint32_t __draw_pattern_src_bitmask_rgb32[16] = {
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
};
/*! adding support with c code template */
#define __API_COLOUR c8bit
#define __API_ELT_SZ 8
#include "__arm_2d_draw_pattern_helium.inc"
#define __API_COLOUR rgb16
#define __API_ELT_SZ 16
#include "__arm_2d_draw_pattern_helium.inc"
#define __API_COLOUR rgb32
#define __API_ELT_SZ 32
#include "__arm_2d_draw_pattern_helium.inc"
#define __API_COLOUR c8bit
#define __API_ELT_SZ 8
#include "__arm_2d_fill_colour_helium.inc"
#define __API_COLOUR rgb16
#define __API_ELT_SZ 16
#include "__arm_2d_fill_colour_helium.inc"
#define __API_COLOUR rgb32
#define __API_ELT_SZ 32
#include "__arm_2d_fill_colour_helium.inc"
/**
8-bit pixel color fill alpha/channel mask with/without opacity MVE intrinsic generator
- TRGT_LOAD is a contigous / strided target load function
C8BIT_TRGT_LOAD / C8BIT_TRGT_LOAD_STRIDE
- STRIDE is an optional vector of offset for gather load
- SCAL_OPACITY is extra alpha scaling function
C8BIT_SCAL_OPACITY_NONE / C8BIT_SCAL_OPACITY
- OPACITY is an optinal 8-bit vector with duplicated opacity values
(need vector format to be used with VMULH.U8)
- ALPHA_SZ, alpha chan width (1 or 4 for resp. 8 or 32-bit type)
Macro assumes pTarget8/ pAlpha are already setup
*/
#define C8BIT_COLOUR_FILLING_MASK_INNER_MVE(TRGT_LOAD, STRIDE, SCAL_OPACITY, \
OPACITY, ALPHA_SZ, COMPVAL) \
int32_t blkCnt = iWidth; \
do { \
mve_pred16_t tailPred = vctp16q(blkCnt); \
\
uint16x8_t vecTarget = vldrbq_z_u16(pTarget8, tailPred); \
uint16x8_t vecTransp = TRGT_LOAD(pAlpha, STRIDE, tailPred); \
\
vecTransp = SCAL_OPACITY(vecTransp, OPACITY, tailPred); \
\
ALPHA_255_COMP_VEC16(vecTransp, COMPVAL); \
\
uint16x8_t vecAlpha = vsubq_x_u16(v256, vecTransp, tailPred); \
\
vecTarget = vmulq_x(vecTarget, vecAlpha, tailPred); \
vecTarget = vmlaq_m(vecTarget, vecTransp, (uint16_t) Colour, tailPred); \
vecTarget = vecTarget >> 8; \
\
vstrbq_p_u16(pTarget8, vecTarget, tailPred); \
\
pAlpha += (8 * ALPHA_SZ); \
pTarget8 += 8; \
blkCnt -= 8; \
} \
while (blkCnt > 0);
/**
RGB565 pixel color fill alpha/channel mask with/without opacity MVE intrinsic generator
- TRGT_LOAD is a contigous / strided target load function
RGB565_TRGT_LOAD / RGB565_TRGT_LOAD_STRIDE
- STRIDE is an optional vector of offset for gather load
- SCAL_OPACITY is extra alpha scaling function
RGB565_SCAL_OPACITY_NONE / RGB565_SCAL_OPACITY
- OPACITY is an optinal 8-bit vector with duplicated opacity values
(need vector format to be used with VMULH.U8)
- P_ALPHA, 8-bit or 32-bit alpha chan. pointer
- ALPHA_SZ, alpha chan width (1 or 4 for resp. 8 or 32-bit type)
Does not generate a tail-predicated loop as relying on pack/unpack functions.
Predication is only applied in the final stage during pixel store.
*/
#define RGB565_COLOUR_FILLING_MASK_MVE(TRGT_LOAD, STRIDE, SCAL_OPACITY, OPACITY, \
P_ALPHA, ALPHA_SZ, COMPVAL) \
uint16x8_t v256 = vdupq_n_u16(256); \
\
for (int_fast16_t y = 0; y < iHeight; y++) { \
const uint8_t *pAlpha = (const uint8_t *)P_ALPHA; \
uint16_t *pCurTarget = pTarget; \
int32_t blkCnt = iWidth; \
\
do { \
uint16x8_t vecTarget = vld1q(pCurTarget); \
uint16x8_t vecTransp = TRGT_LOAD(pAlpha, STRIDE); \
vecTransp = SCAL_OPACITY(vecTransp, OPACITY); \
\
ALPHA_255_COMP_VEC16(vecTransp, COMPVAL); \
\
uint16x8_t vecAlpha = vsubq_u16(v256, vecTransp); \
uint16x8_t vecR, vecG, vecB; \
\
__arm_2d_rgb565_unpack_single_vec(vecTarget, &vecR, &vecG, &vecB); \
\
/* blending using alpha vector weights */ \
vecR = vmulq(vecR, vecAlpha); \
vecR = vmlaq(vecR, vecTransp, (uint16_t) tSrcPix.R); \
vecR = vecR >> 8; \
\
vecG = vmulq(vecG, vecAlpha); \
vecG = vmlaq(vecG, vecTransp, (uint16_t) tSrcPix.G); \
vecG = vecG >> 8; \
\
vecB = vmulq(vecB, vecAlpha); \
vecB = vmlaq(vecB, vecTransp, (uint16_t) tSrcPix.B); \
vecB = vecB >> 8; \
\
vecTarget = __arm_2d_rgb565_pack_single_vec(vecR, vecG, vecB); \
\
/* tail predication */ \
vst1q_p_u16(pCurTarget, vecTarget, vctp16q(blkCnt)); \
\
pAlpha += (8 * ALPHA_SZ); \
pCurTarget += 8; \
blkCnt -= 8; \
} \
while (blkCnt > 0); \
\
P_ALPHA += (iAlphaStride); \
pTarget += (iTargetStride); \
}
/**
CCCN888 pixel color fill alpha/channel mask with/without opacity MVE intrinsic generator
- TRGT_LOAD is a contigous / strided target load function
CCCN888_TRGT_LOAD / CCCN888_TRGT_LOAD_STRIDE
- STRIDE is an optional vector of offset for gather load
- SCAL_OPACITY is extra alpha scaling function
CCCN888_SCAL_OPACITY_NONE / CCCN888_SCAL_OPACITY
- OPACITY is an optinal 8-bit vector with duplicated opacity values
(need vector format to be used with VMULH.U8)
- ALPHA_SZ, alpha chan width (1 or 4 for resp. 8 or 32-bit type)
Macro assumes pTargetCh0/1/2 & pAlpha are already setup
*/
#define CCCN888_COLOUR_FILLING_MASK_INNER_MVE(TRGT_LOAD, STRIDE, SCAL_OPACITY, \
OPACITY, ALPHA_SZ, COMPVAL) \
int32_t blkCnt = iWidth; \
\
do { \
mve_pred16_t tailPred = vctp16q(blkCnt); \
\
/* expand chan0, chan1, chan2 */ \
uint16x8_t vecTargetC0 = vldrbq_gather_offset_z_u16(pTargetCh0, vStride4Offs, \
tailPred); \
uint16x8_t vecTargetC1 = vldrbq_gather_offset_z_u16(pTargetCh1, vStride4Offs, \
tailPred); \
uint16x8_t vecTargetC2 = vldrbq_gather_offset_z_u16(pTargetCh2, vStride4Offs, \
tailPred); \
uint16x8_t vecTransp = TRGT_LOAD(pAlpha, STRIDE, tailPred); \
\
vecTransp = SCAL_OPACITY(vecTransp, OPACITY, tailPred); \
\
ALPHA_255_COMP_VEC16(vecTransp, COMPVAL); \
\
uint16x8_t vecAlpha = vsubq_x_u16(v256, vecTransp, tailPred); \
\
\
/* scale ch0 vector with alpha vector */ \
vecTargetC0 = vmulq_x(vecTargetC0, vecAlpha, tailPred); \
/* blend ch0 vector with input ch0 color*/ \
vecTargetC0 = vmlaq_m(vecTargetC0, vecTransp, (uint16_t) c0, tailPred); \
vecTargetC0 = vecTargetC0 >> 8; \
\
/* repeat for ch1 and ch2 */ \
vecTargetC1 = vmulq_x(vecTargetC1, vecAlpha, tailPred); \
vecTargetC1 = vmlaq_m(vecTargetC1, vecTransp, (uint16_t) c1, tailPred); \
vecTargetC1 = vecTargetC1 >> 8; \
\
vecTargetC2 = vmulq_x(vecTargetC2, vecAlpha, tailPred); \
vecTargetC2 = vmlaq_m(vecTargetC2, vecTransp, (uint16_t) c2, tailPred); \
vecTargetC2 = vecTargetC2 >> 8; \
\
/* store and merge chan0, chan1, chan2 */ \
vstrbq_scatter_offset_p_u16(pTargetCh0, vStride4Offs, vecTargetC0, tailPred); \
vstrbq_scatter_offset_p_u16(pTargetCh1, vStride4Offs, vecTargetC1, tailPred); \
vstrbq_scatter_offset_p_u16(pTargetCh2, vStride4Offs, vecTargetC2, tailPred); \
\
pAlpha += 8 * ALPHA_SZ; \
pTargetCh0 += 8*4; \
pTargetCh1 += 8*4; \
pTargetCh2 += 8*4; \
blkCnt -= 8; \
} \
while (blkCnt > 0);
#define C8BIT_TRGT_LOAD(base, stride, pred) vldrbq_z_u16(base, pred)
#define C8BIT_TRGT_LOAD_STRIDE(base, stride, pred) vldrbq_gather_offset_z_u16(base, stride, pred);
#define C8BIT_SCAL_OPACITY_NONE(transp, opac, pred) transp
#define C8BIT_SCAL_OPACITY(transp, opac, pred) (uint16x8_t) vmulhq_x((uint8x16_t) transp, opac, pred)
#define RGB565_TRGT_LOAD(base, stride) vldrbq_u16(base)
#define RGB565_TRGT_LOAD_STRIDE(base, stride) vldrbq_gather_offset_u16(base, stride);
#define RGB565_SCAL_OPACITY_NONE(transp, opac) transp
#define RGB565_SCAL_OPACITY(transp, opac) (uint16x8_t) vmulhq((uint8x16_t) transp, opac)
#define CCCN888_TRGT_LOAD(base, stride, pred) vldrbq_z_u16(base, pred)
#define CCCN888_TRGT_LOAD_STRIDE(base, stride, pred) vldrbq_gather_offset_z_u16(base, stride, pred);
#define CCCN888_SCAL_OPACITY_NONE(transp, opac, pred) transp
#define CCCN888_SCAL_OPACITY(transp, opac, pred) (uint16x8_t) vmulhq_x((uint8x16_t) transp, opac, pred)
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_gray8_colour_filling_mask)(uint8_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint8_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t * pAlpha = pchAlpha;
uint8_t * pTarget8 = pTarget;
#ifdef USE_MVE_INTRINSICS
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD, _,
C8BIT_SCAL_OPACITY_NONE, _, 1, 255);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
"vecAlphaCompl .req q2 \n"
".p2align 2 \n"
" vldrb.u16 q0, [%[pTarget]] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if alpha == 255, boost to 256 */
" vpt.i16 eq, q1, %[alph255] \n"
" vmovt.i16 q1, #256 \n"
#endif
" vsub.i16 vecAlphaCompl, %[vec256], q1 \n"
" vmul.u16 q3, q0, vecAlphaCompl \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vmla.u16 q3, q1, %[Colour] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
" .unreq vecAlphaCompl \n"
: [pTarget] "+l"(pTarget8), [pAlpha] "+l" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[Colour] "r"(Colour)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[alph255] "r" (255)
#endif
:"q0", "q1", "q2", "q3", "memory");
#endif
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_gray8_colour_filling_mask_opacity)(uint8_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint8_t Colour,
uint_fast16_t hwOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint8x16_t vOpacity = vdupq_n_u8(hwOpacity);
uint16x8_t v256 = vdupq_n_u16(256);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t * pAlpha = pchAlpha;
uint8_t * pTarget8 = pTarget;
#ifdef USE_MVE_INTRINSICS
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD, _,
C8BIT_SCAL_OPACITY, vOpacity, 1, 254);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
"vecAlphaCompl .req q2 \n"
".p2align 2 \n"
" vldrb.u16 q0, [%[pTarget]] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if vOpacity == 254, boost to 256 */
" vpt.i16 eq, q1, %[opa254] \n"
" vmovt.i16 q1, #256 \n"
#endif
" vsub.i16 vecAlphaCompl, %[vec256], q1 \n"
" vmul.u16 q3, q0, vecAlphaCompl \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vmla.u16 q3, q1, %[Colour] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
" .unreq vecAlphaCompl \n"
: [pTarget] "+l"(pTarget8), [pAlpha] "+l" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[Colour] "r"(Colour),[vOpacity] "t"(vOpacity)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[opa254] "r" (254)
#endif
:"q0", "q1", "q2", "q3", "memory");
#endif
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_gray8_colour_filling_channel_mask)(uint8_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint8_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
uint8_t * pTarget8 = pTarget;
#ifdef USE_MVE_INTRINSICS
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD_STRIDE, vStride4Offs,
C8BIT_SCAL_OPACITY_NONE, _, 4, 255);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
"vecAlphaCompl .req q2 \n"
".p2align 2 \n"
" vldrb.u16 q0, [%[pTarget]] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if alpha == 255, boost to 256 */
" vpt.i16 eq, q1, %[alph255] \n"
" vmovt.i16 q1, #256 \n"
#endif
" add %[pAlpha], %[pAlpha], #(8*4) \n"
" vsub.i16 vecAlphaCompl, %[vec256], q1 \n"
" vmul.u16 q3, q0, vecAlphaCompl \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vmla.s16 q3, q1, %[Colour] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
" .unreq vecAlphaCompl \n"
: [pTarget] "+l"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[Colour] "r"(Colour),[str4Offs] "t"(vStride4Offs)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[alph255] "r" (255)
#endif
:"q0", "q1", "q2", "q3", "memory");
#endif
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_gray8_colour_filling_channel_mask_opacity)(uint8_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint8_t Colour,
uint_fast16_t hwOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint8x16_t vOpacity = vdupq_n_u8(hwOpacity);
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
uint8_t *pTarget8 = pTarget;
#ifdef USE_MVE_INTRINSICS
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD_STRIDE, vStride4Offs,
C8BIT_SCAL_OPACITY, vOpacity, 4, 254);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
"vecAlphaCompl .req q2 \n"
".p2align 2 \n"
" vldrb.u16 q0, [%[pTarget]] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if vOpacity == 254, boost to 256 */
" vpt.i16 eq, q1, %[opa254] \n"
" vmovt.i16 q1, #256 \n"
#endif
" add %[pAlpha], %[pAlpha], #(8*4) \n"
" vsub.i16 vecAlphaCompl, %[vec256], q1 \n"
" vmul.u16 q3, q0, vecAlphaCompl \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vmla.s16 q3, q1, %[Colour] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
" .unreq vecAlphaCompl \n"
: [pTarget] "+l"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[Colour] "r"(Colour),[vOpacity] "t"(vOpacity),
[str4Offs] "t"(vStride4Offs)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[opa254] "r" (254)
#endif
:"q0", "q1", "q2", "q3", "memory");
#endif
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb565_colour_filling_mask)(uint16_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint16_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
__arm_2d_color_fast_rgb_t tSrcPix;
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
#ifdef USE_MVE_INTRINSICS
RGB565_COLOUR_FILLING_MASK_MVE( RGB565_TRGT_LOAD, _,
RGB565_SCAL_OPACITY_NONE, _, pchAlpha, 1, 255);
#else
/* RGB565 pack/unpack Masks */
/* use memory rather than vmov to optimize Helium operations interleaving */
uint16x8_t scratch[5];
// Unpacking Mask Red
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
// Unpacking Mask Green
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
// packing Mask Green
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
// packing Mask Blue
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = pchAlpha;
uint16_t *pCurTarget = pTarget;
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile (
".p2align 2 \n"
/* load scheduling */
" vldrh.u16 q0, [%[pTarget]] \n"
" vmov.i16 q7, #0x0100 \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if alpha == 255, boost to 256 */
" vpt.i16 eq, q1, %[alph255] \n"
" vmovt.i16 q1, #256 \n"
#endif
// vecAlpha
" vsub.i16 q2, q7, q1 \n"
/* RGB565 unpack */
/* vecAlpha * 4 for G channel upscale */
" vmul.i16 q2, q2, %[four] \n"
/* G channel extract */
" vshr.u16 q5, q0, #5 \n"
/* load Unpacking Mask for R channel */
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
" vand q4, q0, q7 \n"
/* load Unpacking Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
" vand q5, q5, q7 \n"
/* scale G vector with alpha vector */
" vmul.u16 q5, q5, q2 \n"
/* B channel */
" vshr.u16 q6, q0, #11 \n"
/* blend G vector with input G color*/
" vmla.s16 q5, q1, %[G] \n"
/* vecAlpha * 8 for R & B upscale */
" vshl.i16 q2, q2, #1 \n"
/* scale R vector with alpha vec */
" vmul.u16 q4, q4, q2 \n"
" vshr.u16 q5, q5, #8 \n"
/* blend R vector with input R color*/
" vmla.s16 q4, q1, %[B] \n"
/* load packing Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
/* scale B vector with alpha vector */
" vmul.u16 q6, q6, q2 \n"
" vand q5, q5, q7 \n"
/* blend B vector with input B color*/
" vmla.s16 q6, q1, %[R] \n"
/* load packing Mask for B channel */
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
" vshr.u16 q6, q6, #8 \n"
/* RGB 565 pack */
/* (G & 0x00fc), 8) */
" vmul.i16 q5, q5, %[eight] \n"
/* (B & 0x00f8) */
" vand q6, q6, q7 \n"
/* load next alpha vector */
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmov.i16 q7, #0x0100 \n"
/* pack G & B */
" vmla.s16 q5, q6, %[twofiftysix] \n"
/* combined (R >> 8) >> 3 */
" vshr.u16 q4, q4, #11 \n"
/* load next target */
" vldrh.u16 q0, [%[pTarget], #16] \n"
/* pack R */
" vorr q4, q4, q5 \n"
" vstrh.16 q4, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[pTarget]"+l"(pCurTarget),[pAlpha] "+l"(pAlpha),[loopCnt] "+r"(blkCnt)
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
[twofiftysix] "r" (256), [scratch] "r" (scratch)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[alph255] "r" (255)
#endif
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb565_colour_filling_mask_opacity)(uint16_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint16_t Colour,
uint_fast16_t hwOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint8x16_t vOpacity = vdupq_n_u8(hwOpacity);
__arm_2d_color_fast_rgb_t tSrcPix;
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
#ifdef USE_MVE_INTRINSICS
RGB565_COLOUR_FILLING_MASK_MVE( RGB565_TRGT_LOAD, _,
RGB565_SCAL_OPACITY, vOpacity, pchAlpha, 1, 254);
#else
/* RGB565 pack/unpack Masks + opacity */
/* use memory rather than vmov to optimize Helium operations interleaving */
uint16x8_t scratch[6];
// Unpacking Mask Red
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
// Unpacking Mask Green
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
// packing Mask Green
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
// packing Mask Blue
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
// opacity
vst1q((uint16_t*)&scratch[4], (uint16x8_t)vOpacity);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = pchAlpha;
uint16_t *pCurTarget = pTarget;
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile (
".p2align 2 \n"
/* load scheduling */
" vldrh.u16 q0, [%[pTarget]] \n"
" vmov.i16 q7, #0x0100 \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
/* opacity vector */
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
" vmulh.u8 q1, q1, q6 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if vOpacity == 254, boost to 256 */
" vpt.i16 eq, q1, %[opa254] \n"
" vmovt.i16 q1, #256 \n"
#endif
// vecAlpha
" vsub.i16 q2, q7, q1 \n"
/* RGB565 unpack */
/* vecAlpha * 4 for G channel upscale */
" vmul.i16 q2, q2, %[four] \n"
/* G channel extract */
" vshr.u16 q5, q0, #5 \n"
/* load Unpacking Mask for R channel */
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
" vand q4, q0, q7 \n"
/* load Unpacking Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
" vand q5, q5, q7 \n"
/* scale G vector with alpha vector */
" vmul.u16 q5, q5, q2 \n"
/* B channel */
" vshr.u16 q6, q0, #11 \n"
/* blend G vector with input G color*/
" vmla.s16 q5, q1, %[G] \n"
/* vecAlpha * 8 for R & B upscale */
" vshl.i16 q2, q2, #1 \n"
/* scale R vector with alpha vec */
" vmul.u16 q4, q4, q2 \n"
" vshr.u16 q5, q5, #8 \n"
/* blend R vector with input R color*/
" vmla.s16 q4, q1, %[B] \n"
/* load packing Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
/* scale B vector with alpha vector */
" vmul.u16 q6, q6, q2 \n"
" vand q5, q5, q7 \n"
/* blend B vector with input B color*/
" vmla.s16 q6, q1, %[R] \n"
/* load packing Mask for B channel */
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
" vshr.u16 q6, q6, #8 \n"
/* RGB 565 pack */
/* (G & 0x00fc), 8) */
" vmul.i16 q5, q5, %[eight] \n"
/* (B & 0x00f8) */
" vand q6, q6, q7 \n"
/* load next alpha vector */
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmov.i16 q7, #0x0100 \n"
/* pack G & B */
" vmla.s16 q5, q6, %[twofiftysix] \n"
/* reload opacity and scale alpha */
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
" vmulh.u8 q1, q1, q6 \n"
/* combined (R >> 8) >> 3 */
" vshr.u16 q4, q4, #11 \n"
/* load next target */
" vldrh.u16 q0, [%[pTarget], #16] \n"
/* pack R */
" vorr q4, q4, q5 \n"
" vstrh.16 q4, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[pTarget]"+r"(pCurTarget),[pAlpha] "+l"(pAlpha),[loopCnt] "+r"(blkCnt)
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
[twofiftysix] "r" (256), [scratch] "r" (scratch)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[opa254] "r" (254)
#endif
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb565_colour_filling_channel_mask)(uint16_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint16_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
__arm_2d_color_fast_rgb_t tSrcPix;
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
#ifdef USE_MVE_INTRINSICS
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD_STRIDE, vStride4Offs,
RGB565_SCAL_OPACITY_NONE, _, pwAlpha, 4, 1);
#else
/* RGB565 pack/unpack Masks */
/* use memory rather than vmov to optimize Helium operations interleaving */
uint16x8_t scratch[4];
// Unpacking Mask Red
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
// Unpacking Mask Green
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
// packing Mask Green
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
// packing Mask Blue
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint32_t *pAlpha = pwAlpha;
uint16_t *pCurTarget = pTarget;
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile (
".p2align 2 \n"
/* load scheduling */
" vldrh.u16 q0, [%[pTarget]] \n"
" vmov.i16 q7, #0x0100 \n"
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" add %[pAlpha], %[pAlpha],#(8*4)\n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if alpha == 255, boost to 256 */
" vpt.i16 eq, q1, %[alph255] \n"
" vmovt.i16 q1, #256 \n"
#endif
// vecAlpha
" vsub.i16 q2, q7, q1 \n"
/* RGB565 unpack */
/* vecAlpha * 4 for G channel upscale */
" vmul.i16 q2, q2, %[four] \n"
/* G channel extract */
" vshr.u16 q5, q0, #5 \n"
/* load Unpacking Mask for R channel */
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
" vand q4, q0, q7 \n"
/* load Unpacking Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
" vand q5, q5, q7 \n"
/* scale G vector with alpha vector */
" vmul.u16 q5, q5, q2 \n"
/* B channel */
" vshr.u16 q6, q0, #11 \n"
/* blend G vector with input G color*/
" vmla.s16 q5, q1, %[G] \n"
/* vecAlpha * 8 for R & B upscale */
" vshl.i16 q2, q2, #1 \n"
/* scale R vector with alpha vec */
" vmul.u16 q4, q4, q2 \n"
" vshr.u16 q5, q5, #8 \n"
/* blend R vector with input R color*/
" vmla.s16 q4, q1, %[B] \n"
/* load packing Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
/* scale B vector with alpha vector */
" vmul.u16 q6, q6, q2 \n"
" vand q5, q5, q7 \n"
/* blend B vector with input B color*/
" vmla.s16 q6, q1, %[R] \n"
/* load packing Mask for B channel */
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
" vshr.u16 q6, q6, #8 \n"
/* RGB 565 pack */
/* (G & 0x00fc), 8) */
" vmul.i16 q5, q5, %[eight] \n"
/* (B & 0x00f8) */
" vand q6, q6, q7 \n"
/* load next alpha vector */
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
" vmov.i16 q7, #0x0100 \n"
/* pack G & B */
" vmla.s16 q5, q6, %[twofiftysix] \n"
/* combined (R >> 8) >> 3 */
" vshr.u16 q4, q4, #11 \n"
/* load next target */
" vldrh.u16 q0, [%[pTarget], #16] \n"
/* pack R */
" vorr q4, q4, q5 \n"
" vstrh.16 q4, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
[twofiftysix] "r" (256), [scratch] "r" (scratch), [str4Offs] "t"(vStride4Offs)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[alph255] "r" (255)
#endif
:"q0", "q1", "q2", "q4", "q5", "q6", "q7", "memory");
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_rgb565_colour_filling_channel_mask_opacity)(uint16_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint16_t Colour,
uint_fast16_t hwOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint8x16_t vOpacity = vdupq_n_u8(hwOpacity);
__arm_2d_color_fast_rgb_t tSrcPix;
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
#ifdef USE_MVE_INTRINSICS
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD_STRIDE, vStride4Offs,
RGB565_SCAL_OPACITY, vOpacity, pwAlpha, 4, 2);
#else
/* RGB565 pack/unpack Masks + opacity */
/* use memory rather than vmov to optimize Helium operations interleaving */
uint16x8_t scratch[5];
// Unpacking Mask Red
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
// Unpacking Mask Green
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
// packing Mask Green
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
// packing Mask Blue
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
// opacity
vst1q((uint16_t*)&scratch[4], (uint16x8_t)vOpacity);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint32_t *pAlpha = pwAlpha;
uint16_t *pCurTarget = pTarget;
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile (
".p2align 2 \n"
/* load scheduling */
" vldrh.u16 q0, [%[pTarget]] \n"
" vmov.i16 q7, #0x0100 \n"
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
/* opacity vector */
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
" vmulh.u8 q1, q1, q6 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" add %[pAlpha], %[pAlpha],#(8*4)\n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if vOpacity == 254, boost to 256 */
" vpt.i16 eq, q1, %[opa254] \n"
" vmovt.i16 q1, #256 \n"
#endif
// vecAlpha
" vsub.i16 q2, q7, q1 \n"
/* RGB565 unpack */
/* vecAlpha * 4 for G channel upscale */
" vmul.i16 q2, q2, %[four] \n"
/* G channel extract */
" vshr.u16 q5, q0, #5 \n"
/* load Unpacking Mask for R channel */
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
" vand q4, q0, q7 \n"
/* load Unpacking Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
" vand q5, q5, q7 \n"
/* scale G vector with alpha vector */
" vmul.u16 q5, q5, q2 \n"
/* B channel */
" vshr.u16 q6, q0, #11 \n"
/* blend G vector with input G color*/
" vmla.s16 q5, q1, %[G] \n"
/* vecAlpha * 8 for R & B upscale */
" vshl.i16 q2, q2, #1 \n"
/* scale R vector with alpha vec */
" vmul.u16 q4, q4, q2 \n"
" vshr.u16 q5, q5, #8 \n"
/* blend R vector with input R color*/
" vmla.s16 q4, q1, %[B] \n"
/* load packing Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
/* scale B vector with alpha vector */
" vmul.u16 q6, q6, q2 \n"
" vand q5, q5, q7 \n"
/* blend B vector with input B color*/
" vmla.s16 q6, q1, %[R] \n"
/* load packing Mask for B channel */
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
" vshr.u16 q6, q6, #8 \n"
/* RGB 565 pack */
/* (G & 0x00fc), 8) */
" vmul.i16 q5, q5, %[eight] \n"
/* (B & 0x00f8) */
" vand q6, q6, q7 \n"
/* load next alpha vector */
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
" vmov.i16 q7, #0x0100 \n"
/* pack G & B */
" vmla.s16 q5, q6, %[twofiftysix] \n"
/* combined (R >> 8) >> 3 */
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
" vmulh.u8 q1, q1, q6 \n"
" vshr.u16 q4, q4, #11 \n"
/* load next target */
" vldrh.u16 q0, [%[pTarget], #16] \n"
/* pack R */
" vorr q4, q4, q5 \n"
" vstrh.16 q4, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
[twofiftysix] "r" (256), [scratch] "r" (scratch), [str4Offs] "t"(vStride4Offs)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[opa254] "r" (254)
#endif
:"q0", "q1", "q2", "q4", "q5", "q6", "q7", "memory");
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_cccn888_colour_filling_mask)(uint32_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint32_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint16_t c0, c1, c2;
c0 = Colour & 0xff;
c1 = (Colour >> 8) & 0xff;
c2 = (Colour >> 16) & 0xff;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t * pAlpha = pchAlpha;
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
uint8_t * pTargetCh1 = pTargetCh0 + 1;
uint8_t * pTargetCh2 = pTargetCh0 + 2;
#ifdef USE_MVE_INTRINSICS
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD, _,
CCCN888_SCAL_OPACITY_NONE, _, 1, 255);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
"vecAlphaCompl .req q2 \n"
".p2align 2 \n"
/* expand chan0 */
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if alpha == 255, boost to 256 */
" vpt.i16 eq, q1, %[alph255] \n"
" vmovt.i16 q1, #256 \n"
#endif
" vsub.i16 vecAlphaCompl, %[vec256], q1 \n"
/* scale ch0 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
/* expand chan1 */
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
/* blend ch0 vector with input ch0 color*/
" vmla.s16 q3, q1, %[c0] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
/* scale ch1 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
/* expand chan2 */
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
/* blend ch1 vector with input ch1 color*/
" vmla.s16 q3, q1, %[c1] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
" adds %[pTargetCh0], #32 \n"
" adds %[pTargetCh1], #32 \n"
/* scale ch2 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
/* blend ch2 vector with input ch2 color*/
" vmla.s16 q3, q1, %[c2] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
" add.w %[pTargetCh2], %[pTargetCh2], #32 \n"
" letp lr, 2b \n"
"1: \n"
" .unreq vecAlphaCompl \n"
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+l" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs),
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[alph255] "r" (255)
#endif
:"q0", "q1", "q2", "q3", "memory", "cc");
#endif
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_cccn888_colour_filling_mask_opacity)(uint32_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint32_t Colour,
uint_fast16_t hwOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint8x16_t vOpacity = vdupq_n_u8(hwOpacity);
uint16_t c0, c1, c2;
c0 = Colour & 0xff;
c1 = (Colour >> 8) & 0xff;
c2 = (Colour >> 16) & 0xff;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t * pAlpha = pchAlpha;
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
uint8_t * pTargetCh1 = pTargetCh0 + 1;
uint8_t * pTargetCh2 = pTargetCh0 + 2;
#ifdef USE_MVE_INTRINSICS
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD, _,
CCCN888_SCAL_OPACITY, vOpacity, 1, 254);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
"vecAlphaCompl .req q2 \n"
".p2align 2 \n"
/* expand chan0 */
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if vOpacity == 254, boost to 256 */
" vpt.i16 eq, q1, %[opa254] \n"
" vmovt.i16 q1, #256 \n"
#endif
" vsub.i16 vecAlphaCompl, %[vec256], q1 \n"
/* scale ch0 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
/* expand chan1 */
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
/* blend ch0 vector with input ch0 color*/
" vmla.s16 q3, q1, %[c0] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
/* scale ch1 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
/* expand chan2 */
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
/* blend ch1 vector with input ch1 color*/
" vmla.s16 q3, q1, %[c1] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
" adds %[pTargetCh0], #32 \n"
" adds %[pTargetCh1], #32 \n"
/* scale ch2 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
/* blend ch2 vector with input ch2 color*/
" vmla.s16 q3, q1, %[c2] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
" add.w %[pTargetCh2], %[pTargetCh2], #32 \n"
" letp lr, 2b \n"
"1: \n"
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+l" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs),
[vOpacity] "t"(vOpacity),
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[opa254] "r" (254)
#endif
:"q0", "q1", "q2", "q3", "memory", "cc");
#endif
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_cccn888_colour_filling_channel_mask)(uint32_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint32_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint16_t c0, c1, c2;
c0 = Colour & 0xff;
c1 = (Colour >> 8) & 0xff;
c2 = (Colour >> 16) & 0xff;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
uint8_t * pTargetCh1 = pTargetCh0 + 1;
uint8_t * pTargetCh2 = pTargetCh0 + 2;
#ifdef USE_MVE_INTRINSICS
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD_STRIDE, vStride4Offs,
CCCN888_SCAL_OPACITY_NONE, _, 4, 255);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
"vecAlphaCompl .req q2 \n"
".p2align 2 \n"
/* expand chan0 */
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if alpha == 255, boost to 256 */
" vpt.i16 eq, q1, %[alph255] \n"
" vmovt.i16 q1, #256 \n"
#endif
" vsub.i16 vecAlphaCompl, %[vec256], q1 \n"
/* scale ch0 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
/* expand chan1 */
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
/* blend ch0 vector with input ch0 color*/
" vmla.s16 q3, q1, %[c0] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
/* scale ch1 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
/* expand chan2 */
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
/* blend ch1 vector with input ch1 color*/
" vmla.s16 q3, q1, %[c1] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
" adds %[pAlpha], #32 \n"
" adds %[pTargetCh0], #32 \n"
/* scale ch2 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
/* blend ch2 vector with input ch2 color*/
" vmla.s16 q3, q1, %[c2] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
" adds %[pTargetCh1], #32 \n"
" adds %[pTargetCh2], #32 \n"
" letp lr, 2b \n"
"1: \n"
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs),
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[alph255] "r" (255)
#endif
:"q0", "q1", "q2", "q3", "memory", "cc");
#endif
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __MVE_WRAPPER( __arm_2d_impl_cccn888_colour_filling_channel_mask_opacity)(uint32_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint32_t Colour,
uint_fast16_t hwOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint8x16_t vOpacity = vdupq_n_u8(hwOpacity);
uint16_t c0, c1, c2;
c0 = Colour & 0xff;
c1 = (Colour >> 8) & 0xff;
c2 = (Colour >> 16) & 0xff;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
uint8_t * pTargetCh1 = pTargetCh0 + 1;
uint8_t * pTargetCh2 = pTargetCh0 + 2;
#ifdef USE_MVE_INTRINSICS
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD_STRIDE, vStride4Offs,
CCCN888_SCAL_OPACITY, vOpacity, 4, 254);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
"vecAlphaCompl .req q2 \n"
".p2align 2 \n"
/* expand chan0 */
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
/* if vOpacity == 254, boost to 256 */
" vpt.i16 eq, q1, %[opa254] \n"
" vmovt.i16 q1, #256 \n"
#endif
" vsub.i16 vecAlphaCompl, %[vec256], q1 \n"
/* scale ch0 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
/* expand chan1 */
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
/* blend ch0 vector with input ch0 color*/
" vmla.s16 q3, q1, %[c0] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
/* scale ch1 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
/* expand chan2 */
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
/* blend ch1 vector with input ch1 color*/
" vmla.s16 q3, q1, %[c1] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
" adds %[pAlpha], #32 \n"
" adds %[pTargetCh0], #32 \n"
/* scale ch2 vector with alpha vector */
" vmul.u16 q3, q0, vecAlphaCompl \n"
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
/* blend ch2 vector with input ch2 color*/
" vmla.s16 q3, q1, %[c2] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
" adds %[pTargetCh1], #32 \n"
" adds %[pTargetCh2], #32 \n"
" letp lr, 2b \n"
"1: \n"
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs), [vOpacity] "t"(vOpacity),
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
#if !defined(__ARM_2D_CFG_UNSAFE_IGNORE_ALPHA_255_COMPENSATION__)
,[opa254] "r" (254)
#endif
:"q0", "q1", "q2", "q3", "memory", "cc");
#endif
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
/* use macro expansion of fill/copy with masking */
#define __API_CAFWM_COLOUR gray8
#include "__arm_2d_alpha_mask_helium.inc"
#define __API_CAFWM_CFG_1_HORIZONTAL_LINE 1
#define __API_CAFWM_COLOUR gray8
#include "__arm_2d_alpha_mask_helium.inc"
#define __API_CAFWM_CFG_CHANNEL_8in32_SUPPORT 1
#define __API_CAFWM_COLOUR gray8
#include "__arm_2d_alpha_mask_helium.inc"
#define __API_CAFWM_COLOUR rgb565
#include "__arm_2d_alpha_mask_helium.inc"
#define __API_CAFWM_CFG_1_HORIZONTAL_LINE 1
#define __API_CAFWM_COLOUR rgb565
#include "__arm_2d_alpha_mask_helium.inc"
#define __API_CAFWM_CFG_CHANNEL_8in32_SUPPORT 1
#define __API_CAFWM_COLOUR rgb565
#include "__arm_2d_alpha_mask_helium.inc"
#define __API_CAFWM_COLOUR cccn888
#include "__arm_2d_alpha_mask_helium.inc"
#define __API_CAFWM_CFG_1_HORIZONTAL_LINE 1
#define __API_CAFWM_COLOUR cccn888
#include "__arm_2d_alpha_mask_helium.inc"
#define __API_CAFWM_CFG_CHANNEL_8in32_SUPPORT 1
#define __API_CAFWM_COLOUR cccn888
#include "__arm_2d_alpha_mask_helium.inc"
/*----------------------------------------------------------------------------*
* Assembly Patches *
*----------------------------------------------------------------------------*/
#if defined(__IS_COMPILER_GCC__) && __IS_COMPILER_GCC__
__OVERRIDE_WEAK
void ARM_2D_WRAP_FUNC( __MVE_WRAPPER( __arm_2d_impl_rgb565_masks_fill))(
uint16_t * __RESTRICT ptSourceBase,
int16_t iSourceStride,
arm_2d_size_t * __RESTRICT ptSourceSize,
uint8_t * __RESTRICT pchSourceMaskBase,
int16_t iSourceMaskStride,
arm_2d_size_t * __RESTRICT ptSourceMaskSize,
uint16_t * __RESTRICT ptTargetBase,
int16_t iTargetStride,
arm_2d_size_t * __RESTRICT ptTargetSize,
uint8_t * __RESTRICT pchTargetMaskBase,
int16_t iTargetMaskStride,
arm_2d_size_t * __RESTRICT ptTargetMaskSize)
{
uint8_t *__RESTRICT pchTargetMaskLineBase = pchTargetMaskBase;
uint16x8_t v256 = vdupq_n_u16(256);
#ifndef USE_MVE_INTRINSICS
uint16x8_t scratch[5];
/* vector of 256 avoiding use of vdup to increase overlap efficiency*/
vst1q((uint16_t *) & scratch[0], v256);
/* scratch[1] is temporary for blended Red chan. vector */
/* Unpacking Mask Red */
vst1q((uint16_t *) & scratch[2], vdupq_n_u16(0x00fc));
/* B channel packing mask */
vst1q((uint16_t *) & scratch[3], vdupq_n_u16(0xf800));
/* G channel packing Mask */
vst1q((uint16_t *) & scratch[4], vdupq_n_u16(0x07e0));
/* use of fixed point mult instead of vshr to increase overlap efficiency */
const int16_t inv_2pow3 = 1 << (15 - 3); /* 1/(2^3) in Q.15 */
#endif
for (int_fast16_t iTargetY = 0; iTargetY < ptTargetSize->iHeight;) {
uint16_t *__RESTRICT ptSource = ptSourceBase;
uint8_t *pchSourceMask = pchSourceMaskBase;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
int_fast16_t iSourceMaskY = 0;
#endif
for (int_fast16_t iSourceY = 0; iSourceY < ptSourceSize->iHeight; iSourceY++) {
uint16_t *__RESTRICT ptTarget = ptTargetBase;
uint8_t *__RESTRICT pchTargetMask = pchTargetMaskLineBase;
uint_fast32_t wLengthLeft = ptTargetSize->iWidth;
do {
uint_fast32_t wLength = MIN(wLengthLeft, ptSourceSize->iWidth);
uint16_t *__RESTRICT ptSrc = ptSource;
uint8_t *__RESTRICT pchSrcMsk = pchSourceMask;
uint16_t *__RESTRICT ptTargetCur = ptTarget;
uint8_t *__RESTRICT pchTargetMaskCur = pchTargetMask;
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt = wLength;
do {
uint16x8_t vecTarget = vld1q(ptTargetCur);
uint16x8_t vecSource = vld1q(ptSrc);
uint16x8_t vecSrcMsk = vldrbq_u16(pchSrcMsk);
uint16x8_t vecTargetMask = vldrbq_u16(pchTargetMaskCur);
uint16x8_t vecHwOpacity =
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8);
vecTarget = __arm_2d_rgb565_blending_opacity_single_vec(
vecTarget, vecSource, vecHwOpacity);
/* tail predication */
vst1q_p_u16(ptTargetCur, vecTarget, vctp16q(blkCnt));
pchSrcMsk += 8;
pchTargetMaskCur += 8;
ptTargetCur += 8;
ptSrc += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
register unsigned blkCnt __asm("lr");
blkCnt = wLength;
__asm volatile (
/* R & B mask */
"vecRBUnpackMask .req q7 \n"
"vecAlpha .req q5 \n"
"vecHwOpacity .req q3 \n"
/* preload */
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
" vmov.i16 vecRBUnpackMask, #0x00f8 \n"
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
".p2align 2 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
/* vecSrcMsk * vecTargetMask */
" vmul.i16 q0, q5, q0 \n"
" vldrh.u16 q6, [%[ptTarget]] \n"
" vshr.u16 vecAlpha, q0, #8 \n"
/* 256-dup vector */
" vldrh.u16 q1, [%[scratch], #(16*0)] \n"
/* vecHwOpacity =
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8) */
" vsub.i16 vecHwOpacity, q1, vecAlpha \n"
" vldrh.u16 q1, [%[ptSrc]], #16 \n"
/* mimic vshl #3 */
" vshl.u16 q0, q6, #3 \n"
" vmul.i16 q4, q1, %[eight] \n"
/* vecR extract and scale */
" vand q0, q0, vecRBUnpackMask \n"
" vmul.i16 q0, vecHwOpacity, q0 \n"
/* vecSrcR extract and scale */
" vand q4, q4, vecRBUnpackMask \n"
" vmul.i16 q4, vecAlpha, q4 \n"
/* 0xfc G-mask */
" vldrw.u32 q2, [%[scratch], #(16*2)] \n"
" vadd.i16 q4, q0, q4 \n"
/* push blended R */
" vstrw.32 q4, [%[scratch], #(16*1)] \n"
/* mimic vshr.u16 q4, q6, #3 */
" vqdmulh.s16 q4, q6, %[inv_2pow3] \n"
" vshr.u16 q0, q1, #3 \n"
/* vecG extract and scale */
" vand q4, q4, q2 \n"
" vmul.i16 q4, vecHwOpacity, q4 \n"
/* vecSrcG extract and scale */
" vand q0, q0, q2 \n"
" vmul.i16 q2, vecAlpha, q0 \n"
" vshr.u16 q0, q1, #8 \n"
/* blended G */
/* vadd.i16 q2, q4, q2
addition using vmla for more efficient overlap */
" vmla.s16 q2, q4, %[one] \n"
/* vecB extract and scale */
" vshr.u16 q4, q6, #8 \n"
" vand q4, q4, vecRBUnpackMask \n"
" vmul.i16 q4, vecHwOpacity, q4 \n"
/* vecSrcB extract and scale */
" vand q0, q0, vecRBUnpackMask \n"
" vmul.i16 q0, vecAlpha, q0 \n"
".unreq vecAlpha \n"
".unreq vecHwOpacity \n"
".unreq vecRBUnpackMask \n"
/* reload blended R */
" vldrw.u32 q1, [%[scratch], #(16*1)] \n"
/* blended B
vadd.i16 q0, q4, q0
addition using vmla for more efficient overlap */
" vmla.s16 q0, q4, %[one] \n"
/* pack R */
" vshr.u16 q3, q1, #11 \n"
/* B channel packing mask 0xf800 */
" vldrw.u32 q4, [%[scratch], #(16*3)] \n"
" vand q0, q0, q4 \n"
/* accumulate R & B */
" vorr q4, q0, q3 \n"
/* G channel packing mask 0x07e0 */
" vldrw.u32 q3, [%[scratch], #(16*4)] \n"
" vshr.u16 q2, q2, #5 \n"
/* load next source mask */
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
/* G channel masking */
" vand q2, q2, q3 \n"
/* load next target mask */
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
/* pack G with R.B */
" vorr q4, q4, q2 \n"
" vstrh.16 q4, [%[ptTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[ptTarget] "+r"(ptTargetCur),[ptSrc] "+r"(ptSrc),
[pchTargetMask] "+l"(pchTargetMaskCur),[pchSrcMsk] "+l"(pchSrcMsk),
[loopCnt] "+r"(blkCnt)
:[scratch] "r" (scratch),[eight] "r"(8),[inv_2pow3] "r"(inv_2pow3),
[one] "r" (1)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
#endif
ptTarget += wLength;
pchTargetMask += wLength;
wLengthLeft -= wLength;
} while (wLengthLeft);
ptSource += iSourceStride;
ptTargetBase += iTargetStride;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
iSourceMaskY++;
//! handle source mask
if ( (iSourceMaskY >= ptSourceMaskSize->iHeight)
|| (iSourceMaskY >= ptSourceSize->iHeight)) {
pchSourceMask = pchSourceMaskBase;
iSourceMaskY = 0;
} else {
pchSourceMask += iSourceMaskStride;
}
#else
pchSourceMask += iSourceMaskStride;
#endif
pchTargetMaskLineBase += iTargetMaskStride;
iTargetY++;
if (iTargetY >= ptTargetSize->iHeight) {
break;
}
}
}
}
__OVERRIDE_WEAK
void ARM_2D_WRAP_FUNC( __MVE_WRAPPER( __arm_2d_impl_rgb565_src_msk_1h_des_msk_fill))(
uint16_t * __RESTRICT ptSourceBase,
int16_t iSourceStride,
arm_2d_size_t * __RESTRICT ptSourceSize,
uint8_t * __RESTRICT pchSourceMaskBase,
int16_t iSourceMaskStride,
arm_2d_size_t * __RESTRICT ptSourceMaskSize,
uint16_t * __RESTRICT ptTargetBase,
int16_t iTargetStride,
arm_2d_size_t * __RESTRICT ptTargetSize,
uint8_t * __RESTRICT pchTargetMaskBase,
int16_t iTargetMaskStride,
arm_2d_size_t * __RESTRICT ptTargetMaskSize)
{
uint8_t *__RESTRICT pchTargetMaskLineBase = pchTargetMaskBase;
uint16x8_t v256 = vdupq_n_u16(256);
#ifndef USE_MVE_INTRINSICS
uint16x8_t scratch[5];
/* vector of 256 avoiding use of vdup to increase overlap efficiency*/
vst1q((uint16_t *) & scratch[0], v256);
/* scratch[1] is temporary for blended Red chan. vector */
/* Unpacking Mask Red */
vst1q((uint16_t *) & scratch[2], vdupq_n_u16(0x00fc));
/* B channel packing mask */
vst1q((uint16_t *) & scratch[3], vdupq_n_u16(0xf800));
/* G channel packing Mask */
vst1q((uint16_t *) & scratch[4], vdupq_n_u16(0x07e0));
/* use of fixed point mult instead of vshr to increase overlap efficiency */
const int16_t inv_2pow3 = 1 << (15 - 3); /* 1/(2^3) in Q.15 */
#endif
for (int_fast16_t iTargetY = 0; iTargetY < ptTargetSize->iHeight;) {
uint16_t *__RESTRICT ptSource = ptSourceBase;
uint8_t *pchSourceMask = pchSourceMaskBase;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
int_fast16_t iSourceMaskY = 0;
#endif
for (int_fast16_t iSourceY = 0; iSourceY < ptSourceSize->iHeight; iSourceY++) {
uint16_t *__RESTRICT ptTarget = ptTargetBase;
uint8_t *__RESTRICT pchTargetMask = pchTargetMaskLineBase;
uint_fast32_t wLengthLeft = ptTargetSize->iWidth;
do {
uint_fast32_t wLength = MIN(wLengthLeft, ptSourceSize->iWidth);
uint16_t *__RESTRICT ptSrc = ptSource;
uint8_t *__RESTRICT pchSrcMsk = pchSourceMask;
uint16_t *__RESTRICT ptTargetCur = ptTarget;
uint8_t *__RESTRICT pchTargetMaskCur = pchTargetMask;
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt = wLength;
do {
uint16x8_t vecTarget = vld1q(ptTargetCur);
uint16x8_t vecSource = vld1q(ptSrc);
uint16x8_t vecSrcMsk = vldrbq_u16(pchSrcMsk);
uint16x8_t vecTargetMask = vldrbq_u16(pchTargetMaskCur);
uint16x8_t vecHwOpacity =
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8);
vecTarget = __arm_2d_rgb565_blending_opacity_single_vec(
vecTarget, vecSource, vecHwOpacity);
/* tail predication */
vst1q_p_u16(ptTargetCur, vecTarget, vctp16q(blkCnt));
pchSrcMsk += 8;
pchTargetMaskCur += 8;
ptTargetCur += 8;
ptSrc += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
register unsigned blkCnt __asm("lr");
blkCnt = wLength;
__asm volatile (
/* R & B mask */
"vecRBUnpackMask .req q7 \n"
"vecAlpha .req q5 \n"
"vecHwOpacity .req q3 \n"
/* preload */
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
" vmov.i16 vecRBUnpackMask, #0x00f8 \n"
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
".p2align 2 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
/* vecSrcMsk * vecTargetMask */
" vmul.i16 q0, q5, q0 \n"
" vldrh.u16 q6, [%[ptTarget]] \n"
" vshr.u16 vecAlpha, q0, #8 \n"
/* 256-dup vector */
" vldrh.u16 q1, [%[scratch], #(16*0)] \n"
/* vecHwOpacity =
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8) */
" vsub.i16 vecHwOpacity, q1, vecAlpha \n"
" vldrh.u16 q1, [%[ptSrc]], #16 \n"
/* mimic vshl #3 */
" vshl.u16 q0, q6, #3 \n"
" vmul.i16 q4, q1, %[eight] \n"
/* vecR extract and scale */
" vand q0, q0, vecRBUnpackMask \n"
" vmul.i16 q0, vecHwOpacity, q0 \n"
/* vecSrcR extract and scale */
" vand q4, q4, vecRBUnpackMask \n"
" vmul.i16 q4, vecAlpha, q4 \n"
/* 0xfc G-mask */
" vldrw.u32 q2, [%[scratch], #(16*2)] \n"
" vadd.i16 q4, q0, q4 \n"
/* push blended R */
" vstrw.32 q4, [%[scratch], #(16*1)] \n"
/* mimic vshr.u16 q4, q6, #3 */
" vqdmulh.s16 q4, q6, %[inv_2pow3] \n"
" vshr.u16 q0, q1, #3 \n"
/* vecG extract and scale */
" vand q4, q4, q2 \n"
" vmul.i16 q4, vecHwOpacity, q4 \n"
/* vecSrcG extract and scale */
" vand q0, q0, q2 \n"
" vmul.i16 q2, vecAlpha, q0 \n"
" vshr.u16 q0, q1, #8 \n"
/* blended G */
/* vadd.i16 q2, q4, q2
addition using vmla for more efficient overlap */
" vmla.s16 q2, q4, %[one] \n"
/* vecB extract and scale */
" vshr.u16 q4, q6, #8 \n"
" vand q4, q4, vecRBUnpackMask \n"
" vmul.i16 q4, vecHwOpacity, q4 \n"
/* vecSrcB extract and scale */
" vand q0, q0, vecRBUnpackMask \n"
" vmul.i16 q0, vecAlpha, q0 \n"
".unreq vecAlpha \n"
".unreq vecHwOpacity \n"
".unreq vecRBUnpackMask \n"
/* reload blended R */
" vldrw.u32 q1, [%[scratch], #(16*1)] \n"
/* blended B
vadd.i16 q0, q4, q0
addition using vmla for more efficient overlap */
" vmla.s16 q0, q4, %[one] \n"
/* pack R */
" vshr.u16 q3, q1, #11 \n"
/* B channel packing mask 0xf800 */
" vldrw.u32 q4, [%[scratch], #(16*3)] \n"
" vand q0, q0, q4 \n"
/* accumulate R & B */
" vorr q4, q0, q3 \n"
/* G channel packing mask 0x07e0 */
" vldrw.u32 q3, [%[scratch], #(16*4)] \n"
" vshr.u16 q2, q2, #5 \n"
/* load next source mask */
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
/* G channel masking */
" vand q2, q2, q3 \n"
/* load next target mask */
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
/* pack G with R.B */
" vorr q4, q4, q2 \n"
" vstrh.16 q4, [%[ptTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[ptTarget] "+r"(ptTargetCur),[ptSrc] "+r"(ptSrc),
[pchTargetMask] "+l"(pchTargetMaskCur),[pchSrcMsk] "+l"(pchSrcMsk),
[loopCnt] "+r"(blkCnt)
:[scratch] "r" (scratch),[eight] "r"(8),[inv_2pow3] "r"(inv_2pow3),
[one] "r" (1)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
#endif
ptTarget += wLength;
pchTargetMask += wLength;
wLengthLeft -= wLength;
} while (wLengthLeft);
ptSource += iSourceStride;
ptTargetBase += iTargetStride;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
iSourceMaskY++;
//! handle source mask
if ( (iSourceMaskY >= ptSourceMaskSize->iHeight)
|| (iSourceMaskY >= ptSourceSize->iHeight)) {
pchSourceMask = pchSourceMaskBase;
iSourceMaskY = 0;
} else {
pchSourceMask += iSourceMaskStride;
}
#else
pchSourceMask += iSourceMaskStride;
#endif
pchTargetMaskLineBase = pchTargetMaskBase;
iTargetY++;
if (iTargetY >= ptTargetSize->iHeight) {
break;
}
}
}
}
#endif
#ifdef __cplusplus
}
#endif
#endif // __ARM_2D_HAS_HELIUM__