mirror of
https://gitee.com/Lyon1998/pikapython.git
synced 2025-01-22 17:12:55 +08:00
1026 lines
49 KiB
C++
1026 lines
49 KiB
C++
|
|
/*
|
|
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the License); you may
|
|
* not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/* ----------------------------------------------------------------------
|
|
* Project: Arm-2D Library
|
|
* Title: __arm_2d_draw_pattern_helium.inc
|
|
* Description: Helium code template for drawing pattern
|
|
*
|
|
* $Date: 29. sep 2021
|
|
* $Revision: V.0.0.3
|
|
*
|
|
* -------------------------------------------------------------------- */
|
|
|
|
|
|
/* The including file has to choose the colour name and element size first. */
#if !defined(__API_COLOUR)
#   error You have to define __API_COLOUR before using this c template
#endif
#if !defined(__API_ELT_SZ)
#   error You have to define the __API_ELT_SZ before using this c template
#endif

/* Discard name-mangling helpers possibly left over from an earlier inclusion. */
#undef __ARM_2D_FUNC
#undef ___ARM_2D_FUNC
#undef ____ARM_2D_FUNC

/*
 * __ARM_2D_FUNC(name) expands to __arm_2d_impl_<__API_COLOUR>_<name>.
 * The intermediate levels let __API_COLOUR be macro-expanded before it is
 * token-pasted into the identifier.
 */
#define ____ARM_2D_FUNC(__NAME, __COLOUR) __arm_2d_impl_##__COLOUR##_##__NAME
#define ___ARM_2D_FUNC(__NAME, __COLOUR) ____ARM_2D_FUNC(__NAME, __COLOUR)
#define __ARM_2D_FUNC(__NAME) ___ARM_2D_FUNC(__NAME, __API_COLOUR)
|
|
|
|
|
|
|
|
#if __API_ELT_SZ != 32
|
|
|
|
__OVERRIDE_WEAK
|
|
void __ARM_2D_FUNC(draw_pattern_fg_only)(uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) ForeColour)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) offS =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_incr_, __API_COLOUR) + iOffset);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vBitMask =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_bitmask_, __API_COLOUR) + iOffset);
|
|
|
|
|
|
if (ptCopySize->iWidth <= ARM_PIX_VECELT(__API_ELT_SZ)) {
|
|
/* small width specialization */
|
|
/* no inner loop */
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(ptCopySize->iWidth);
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc = vldrbq_gather_offset(pchSourceBase, offS);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg = vld1q(pTargetBase);
|
|
|
|
vchSrc = vchSrc & vBitMask;
|
|
vTarg = vdupq_m(vTarg, ForeColour, vcmpneq(vchSrc, 0));
|
|
vst1q_p(pTargetBase, vTarg, p);
|
|
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
#else
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vForeG = ARM_CONNECT2(vdupq_n_u, __API_ELT_SZ)(ForeColour);
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wls lr, %[iHeight], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
" vand q0, q0, %[bitmask] \n"
|
|
|
|
" vcmp.i" TO_STRING(__API_ELT_SZ) \
|
|
" eq, q0, zr \n"
|
|
/* contigous vector load */
|
|
ARM_VLD1_ASM(__API_ELT_SZ) \
|
|
" q0, [%[target]] \n"
|
|
" vpsel q0, q0,%[ForeG] \n"
|
|
" vmsr P0, %[p] \n"
|
|
" vpst \n"
|
|
/* predicated contigous vector store */
|
|
ARM_VST1P_ASM(__API_ELT_SZ) \
|
|
" q0,[%[target]] \n"
|
|
|
|
" add %[src], %[src], %[srcStride] \n"
|
|
" add %[target], %[target], %[targStride] \n"
|
|
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBase), [target] "+r" (pTargetBase)
|
|
: [bitmask] "t" (vBitMask),[ForeG] "t"(vForeG),
|
|
[srcStride] "r" (iSourceStride >> 3),
|
|
[targStride] "r" (iTargetStride * (__API_ELT_SZ/8)),
|
|
[iHeight] "r" (ptCopySize->iHeight),
|
|
[offS] "t" (offS), [p] "r" (p)
|
|
: "q0", "memory", "lr");
|
|
#endif
|
|
} else {
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
int32_t cnt = ptCopySize->iWidth;
|
|
uint8_t *pchSourceBaseCur = pchSourceBase;
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *pTargetBaseCur = pTargetBase;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
while (cnt > 0) {
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(cnt);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc =
|
|
vldrbq_gather_offset_z(pchSourceBaseCur, offS, p);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg = vld1q_z(pTargetBaseCur, p);
|
|
|
|
vchSrc = vandq_x(vchSrc, vBitMask, p);
|
|
vTarg = vdupq_m(vTarg, ForeColour, vcmpneq_m(vchSrc, 0, p));
|
|
|
|
vst1q_p(pTargetBaseCur, vTarg, p);
|
|
|
|
pTargetBaseCur += ARM_PIX_VECELT(__API_ELT_SZ);
|
|
pchSourceBaseCur += ARM_PIX_VECELT(__API_ELT_SZ) / 8;
|
|
cnt -= ARM_PIX_VECELT(__API_ELT_SZ);
|
|
}
|
|
#else
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vForeG = ARM_CONNECT2(vdupq_n_u, __API_ELT_SZ)(ForeColour);
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wlstp."TO_STRING(__API_ELT_SZ) " lr, %[cnt], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
|
|
" vand q0, q0, %[bitmask] \n"
|
|
" add %[src], %[src], %[incrSrc] \n"
|
|
|
|
" vpt.i" TO_STRING(__API_ELT_SZ) \
|
|
" ne, q0, zr \n"
|
|
|
|
/* predicated contigous vector store */
|
|
ARM_VST1P_ASM(__API_ELT_SZ) \
|
|
" %[ForeG],[%[target]], #16 \n"
|
|
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBaseCur), [target] "+r" (pTargetBaseCur)
|
|
: [bitmask] "t" (vBitMask),[ForeG] "t"(vForeG),
|
|
[cnt] "r" (cnt), [incrSrc] "i" (ARM_PIX_VECELT(__API_ELT_SZ) / 8),
|
|
[offS] "t" (offS)
|
|
: "q0", "memory", "lr");
|
|
#endif
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __ARM_2D_FUNC(draw_pattern_no_bg_comp)(
|
|
uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) offS =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_incr_, __API_COLOUR) + iOffset);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vBitMask =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_bitmask_, __API_COLOUR) + iOffset);
|
|
|
|
if (ptCopySize->iWidth <= ARM_PIX_VECELT(__API_ELT_SZ)) {
|
|
/* small width specialization */
|
|
/* no inner loop */
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(ptCopySize->iWidth);
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc = vldrbq_gather_offset(pchSourceBase, offS);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg = vld1q(pTargetBase);
|
|
|
|
vchSrc = vchSrc & vBitMask;
|
|
vTarg = vpselq(~vTarg, vTarg, vcmpneq(vchSrc, 0));
|
|
|
|
vst1q_p(pTargetBase, vTarg, p);
|
|
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
#else
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wls lr, %[iHeight], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
" vand q0, q0, %[bitmask] \n"
|
|
|
|
ARM_VLD1_ASM(__API_ELT_SZ) \
|
|
" q1, [%[target]] \n"
|
|
|
|
" vpt.i" TO_STRING(__API_ELT_SZ) \
|
|
" ne, q0, zr \n"
|
|
" vmvnt q1, q1 \n"
|
|
|
|
" vmsr P0, %[p] \n"
|
|
" vpst \n"
|
|
/* predicated contigous vector store */
|
|
ARM_VST1P_ASM(__API_ELT_SZ) \
|
|
" q1,[%[target]] \n"
|
|
|
|
" add %[src], %[src], %[srcStride] \n"
|
|
" add %[target], %[target], %[targStride] \n"
|
|
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBase), [target] "+r" (pTargetBase)
|
|
: [bitmask] "t" (vBitMask),
|
|
[srcStride] "r" (iSourceStride >> 3),
|
|
[targStride] "r" (iTargetStride*(__API_ELT_SZ/8)),
|
|
[iHeight] "r" (ptCopySize->iHeight),
|
|
[offS] "t" (offS), [p] "r" (p)
|
|
: "q0", "q1", "memory", "lr");
|
|
#endif
|
|
|
|
|
|
} else {
|
|
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
int32_t cnt = ptCopySize->iWidth;
|
|
uint8_t *pchSourceBaseCur = pchSourceBase;
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *pTargetBaseCur = pTargetBase;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
while (cnt > 0) {
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(cnt);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc =
|
|
vldrbq_gather_offset_z(pchSourceBaseCur, offS, p);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg = vld1q_z(pTargetBaseCur, p);
|
|
|
|
vchSrc = vandq_x(vchSrc, vBitMask, p);
|
|
vTarg = vpselq(vmvnq_x(vTarg, p), vTarg, vcmpneq_m(vchSrc, 0, p));
|
|
|
|
vst1q_p(pTargetBaseCur, vTarg, p);
|
|
|
|
pTargetBaseCur += ARM_PIX_VECELT(__API_ELT_SZ);
|
|
pchSourceBaseCur += ARM_PIX_VECELT(__API_ELT_SZ) / 8;
|
|
cnt -= ARM_PIX_VECELT(__API_ELT_SZ);
|
|
}
|
|
#else
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wlstp."TO_STRING(__API_ELT_SZ) \
|
|
" lr, %[cnt], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
|
|
" vand q0, q0, %[bitmask] \n"
|
|
" add %[src], %[src], %[incrSrc] \n"
|
|
|
|
ARM_VLD1_ASM(__API_ELT_SZ) \
|
|
" q1, [%[target]] \n"
|
|
|
|
" vpt.i" TO_STRING(__API_ELT_SZ) \
|
|
" ne, q0, zr \n"
|
|
" vmvnt q1, q1 \n"
|
|
/* contigous vector store */
|
|
ARM_VST1_ASM(__API_ELT_SZ) \
|
|
" q1 ,[%[target]], #16 \n"
|
|
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBaseCur), [target] "+r" (pTargetBaseCur)
|
|
: [bitmask] "t" (vBitMask),
|
|
[cnt] "r" (cnt), [incrSrc] "i" (ARM_PIX_VECELT(__API_ELT_SZ) / 8),
|
|
[offS] "t" (offS)
|
|
: "q0", "q1", "memory", "lr");
|
|
#endif
|
|
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __ARM_2D_FUNC(draw_pattern_bg_only)( uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) BackColour)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) offS =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_incr_, __API_COLOUR) + iOffset);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vBitMask =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_bitmask_, __API_COLOUR) + iOffset);
|
|
|
|
if (ptCopySize->iWidth <= ARM_PIX_VECELT(__API_ELT_SZ)) {
|
|
/* small width specialization */
|
|
/* no inner loop */
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(ptCopySize->iWidth);
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc = vldrbq_gather_offset(pchSourceBase, offS);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg = vld1q(pTargetBase);
|
|
|
|
vchSrc = vchSrc & vBitMask;
|
|
vTarg = vdupq_m(vTarg, BackColour, vcmpeqq(vchSrc, 0));
|
|
|
|
vst1q_p(pTargetBase, vTarg, p);
|
|
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
#else
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vBackG = ARM_CONNECT2(vdupq_n_u, __API_ELT_SZ)(BackColour);
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wls lr, %[iHeight], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
" vand q0, q0, %[bitmask] \n"
|
|
|
|
" vcmp.i" TO_STRING(__API_ELT_SZ) \
|
|
" eq, q0, zr \n"
|
|
/* contigous vector load */
|
|
ARM_VLD1_ASM(__API_ELT_SZ) \
|
|
" q0, [%[target]] \n"
|
|
" vpsel q0, q0,%[vBackG] \n"
|
|
" vmsr P0, %[p] \n"
|
|
" vpst \n"
|
|
/* predicated contigous vector store */
|
|
ARM_VST1P_ASM(__API_ELT_SZ) \
|
|
" q0,[%[target]] \n"
|
|
|
|
" add %[src], %[src], %[srcStride] \n"
|
|
" add %[target], %[target], %[targStride] \n"
|
|
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBase), [target] "+r" (pTargetBase)
|
|
: [bitmask] "t" (vBitMask),[vBackG] "t"(vBackG),
|
|
[srcStride] "r" (iSourceStride >> 3),
|
|
[targStride] "r" (iTargetStride*(__API_ELT_SZ/8)),
|
|
[iHeight] "r" (ptCopySize->iHeight),
|
|
[offS] "t" (offS), [p] "r" (p)
|
|
: "q0", "memory", "lr");
|
|
#endif
|
|
|
|
|
|
} else {
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
int32_t cnt = ptCopySize->iWidth;
|
|
uint8_t *pchSourceBaseCur = pchSourceBase;
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *pTargetBaseCur = pTargetBase;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
while (cnt > 0) {
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(cnt);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc =
|
|
vldrbq_gather_offset_z(pchSourceBaseCur, offS, p);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg = vld1q_z(pTargetBaseCur, p);
|
|
|
|
vchSrc = vandq_x(vchSrc, vBitMask, p);
|
|
vTarg = vdupq_m(vTarg, BackColour, vcmpeqq_m(vchSrc, 0, p));
|
|
|
|
vst1q_p(pTargetBaseCur, vTarg, p);
|
|
|
|
pTargetBaseCur += ARM_PIX_VECELT(__API_ELT_SZ);
|
|
pchSourceBaseCur += ARM_PIX_VECELT(__API_ELT_SZ) / 8;
|
|
cnt -= ARM_PIX_VECELT(__API_ELT_SZ);
|
|
}
|
|
#else
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vBackG = ARM_CONNECT2(vdupq_n_u, __API_ELT_SZ)(BackColour);
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wlstp."TO_STRING(__API_ELT_SZ) " lr, %[cnt], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
|
|
" vand q0, q0, %[bitmask] \n"
|
|
" add %[src], %[src], %[incrSrc] \n"
|
|
|
|
" vpt.i" TO_STRING(__API_ELT_SZ) \
|
|
" ne, q0, zr \n"
|
|
|
|
/* predicated contigous vector store */
|
|
ARM_VST1P_ASM(__API_ELT_SZ) \
|
|
" %[vBackG],[%[target]], #16 \n"
|
|
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBaseCur), [target] "+r" (pTargetBaseCur)
|
|
: [bitmask] "t" (vBitMask),[vBackG] "t"(vBackG),
|
|
[cnt] "r" (cnt), [incrSrc] "i" (ARM_PIX_VECELT(__API_ELT_SZ) / 8),
|
|
[offS] "t" (offS)
|
|
: "q0", "memory", "lr");
|
|
#endif
|
|
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __ARM_2D_FUNC(draw_pattern_bg_fg)( uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) ForeColour,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) BackColour)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) offS =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_incr_, __API_COLOUR) + iOffset);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vBitMask =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_bitmask_, __API_COLOUR) + iOffset);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vFgColor =
|
|
ARM_CONNECT2(vdupq_n_u, __API_ELT_SZ)(ForeColour);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vBgColor =
|
|
ARM_CONNECT2(vdupq_n_u, __API_ELT_SZ)(BackColour);
|
|
|
|
if (ptCopySize->iWidth <= ARM_PIX_VECELT(__API_ELT_SZ)) {
|
|
/* small width specialization */
|
|
/* no inner loop */
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(ptCopySize->iWidth);
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc = vldrbq_gather_offset(pchSourceBase, offS);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg;
|
|
|
|
vchSrc = vchSrc & vBitMask;
|
|
vTarg =
|
|
vpselq(vFgColor, vBgColor, vcmpneq(vchSrc, 0));
|
|
|
|
vst1q_p(pTargetBase, vTarg, p);
|
|
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
#else
|
|
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wls lr, %[iHeight], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
" vand q0, q0, %[bitmask] \n"
|
|
|
|
" vcmp.i" TO_STRING(__API_ELT_SZ) \
|
|
" eq, q0, zr \n"
|
|
" vpsel q0, %[vBackG], %[vForeG] \n"
|
|
" vmsr P0, %[p] \n"
|
|
" vpst \n"
|
|
/* predicated contigous vector store */
|
|
ARM_VST1P_ASM(__API_ELT_SZ) \
|
|
" q0,[%[target]] \n"
|
|
|
|
" add %[src], %[src], %[srcStride] \n"
|
|
" add %[target], %[target], %[targStride] \n"
|
|
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBase), [target] "+r" (pTargetBase)
|
|
: [bitmask] "t" (vBitMask),[vBackG] "t"(vBgColor),
|
|
[vForeG] "t" (vFgColor),
|
|
[srcStride] "r" (iSourceStride >> 3),
|
|
[targStride] "r" (iTargetStride*(__API_ELT_SZ/8)),
|
|
[iHeight] "r" (ptCopySize->iHeight),
|
|
[offS] "t" (offS), [p] "r" (p)
|
|
: "q0", "memory", "lr");
|
|
#endif
|
|
|
|
|
|
} else {
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
int32_t cnt = ptCopySize->iWidth;
|
|
uint8_t *pchSourceBaseCur = pchSourceBase;
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *pTargetBaseCur = pTargetBase;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
while (cnt > 0) {
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(cnt);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc =
|
|
vldrbq_gather_offset_z(pchSourceBaseCur, offS, p);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg;
|
|
|
|
vchSrc = vandq_x(vchSrc, vBitMask, p);
|
|
vTarg =
|
|
vpselq(vFgColor, vBgColor, vcmpneq_m(vchSrc, 0, p));
|
|
|
|
vst1q_p(pTargetBaseCur, vTarg, p);
|
|
|
|
pTargetBaseCur += ARM_PIX_VECELT(__API_ELT_SZ);
|
|
pchSourceBaseCur += ARM_PIX_VECELT(__API_ELT_SZ) / 8;
|
|
cnt -= ARM_PIX_VECELT(__API_ELT_SZ);
|
|
}
|
|
#else
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wlstp."TO_STRING(__API_ELT_SZ) " lr, %[cnt], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
|
|
" vand q0, q0, %[bitmask] \n"
|
|
" add %[src], %[src], %[incrSrc] \n"
|
|
|
|
" vcmp.i" TO_STRING(__API_ELT_SZ) \
|
|
" eq, q0, zr \n"
|
|
" vpsel q0, %[vBackG], %[vForeG] \n"
|
|
|
|
ARM_VST1_ASM(__API_ELT_SZ) \
|
|
" q0, [%[target]], #16 \n"
|
|
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBaseCur), [target] "+r" (pTargetBaseCur)
|
|
: [bitmask] "t" (vBitMask),[vBackG] "t"(vBgColor),
|
|
[vForeG] "t" (vFgColor),
|
|
[cnt] "r" (cnt), [incrSrc] "i" (ARM_PIX_VECELT(__API_ELT_SZ) / 8),
|
|
[offS] "t" (offS)
|
|
: "q0", "memory", "lr");
|
|
#endif
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __ARM_2D_FUNC(draw_pattern_bg_comp)( uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) BackColour)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) offS =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_incr_, __API_COLOUR) + iOffset);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vBitMask =
|
|
vld1q(ARM_CONNECT2(__draw_pattern_src_bitmask_, __API_COLOUR) + iOffset);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vBgColor =
|
|
ARM_CONNECT2(vdupq_n_u, __API_ELT_SZ)(BackColour);
|
|
|
|
if (ptCopySize->iWidth <= ARM_PIX_VECELT(__API_ELT_SZ)) {
|
|
/* small width specialization */
|
|
/* no inner loop */
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(ptCopySize->iWidth);
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc = vldrbq_gather_offset(pchSourceBase, offS);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg = vld1q(pTargetBase);
|
|
|
|
vchSrc = vchSrc & vBitMask;
|
|
vTarg = vpselq(vmvnq(vTarg), vBgColor, vcmpneq(vchSrc, 0));
|
|
|
|
vst1q_p(pTargetBase, vTarg, p);
|
|
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
#else
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wls lr, %[iHeight], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
" vand q0, q0, %[bitmask] \n"
|
|
|
|
ARM_VLD1_ASM(__API_ELT_SZ) \
|
|
" q1, [%[target]] \n"
|
|
|
|
" vmvn q1, q1 \n"
|
|
|
|
" vcmp.i" TO_STRING(__API_ELT_SZ) \
|
|
" eq, q0, zr \n"
|
|
" vpsel q1, %[vBackG], q1 \n"
|
|
|
|
" vmsr P0, %[p] \n"
|
|
" vpst \n"
|
|
/* predicated contigous vector store */
|
|
ARM_VST1P_ASM(__API_ELT_SZ) \
|
|
" q1,[%[target]] \n"
|
|
|
|
" add %[src], %[src], %[srcStride] \n"
|
|
" add %[target], %[target], %[targStride] \n"
|
|
|
|
" le lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBase), [target] "+r" (pTargetBase)
|
|
: [bitmask] "t" (vBitMask), [vBackG] "t" (vBgColor),
|
|
[srcStride] "r" (iSourceStride >> 3),
|
|
[targStride] "r" (iTargetStride*(__API_ELT_SZ/8)),
|
|
[iHeight] "r" (ptCopySize->iHeight),
|
|
[offS] "t" (offS), [p] "r" (p)
|
|
: "q0", "q1", "memory", "lr");
|
|
#endif
|
|
} else {
|
|
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
|
|
int32_t cnt = ptCopySize->iWidth;
|
|
uint8_t *pchSourceBaseCur = pchSourceBase;
|
|
ARM_PIX_SCLTYP(__API_ELT_SZ) *pTargetBaseCur = pTargetBase;
|
|
|
|
#ifdef USE_MVE_INTRINSICS
|
|
while (cnt > 0) {
|
|
mve_pred16_t p = ARM_CONNECT2(ARM_CONNECT2(vctp, __API_ELT_SZ), q)(cnt);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vchSrc =
|
|
vldrbq_gather_offset_z(pchSourceBaseCur, offS, p);
|
|
ARM_PIX_VECTYP(__API_ELT_SZ) vTarg = vld1q_z(pTargetBaseCur, p);
|
|
|
|
vchSrc = vandq_x(vchSrc, vBitMask, p);
|
|
vTarg =
|
|
vpselq(vmvnq_x(vTarg, p), vBgColor, vcmpneq_m(vchSrc, 0, p));
|
|
|
|
vst1q_p(pTargetBaseCur, vTarg, p);
|
|
|
|
pTargetBaseCur += ARM_PIX_VECELT(__API_ELT_SZ);
|
|
pchSourceBaseCur += ARM_PIX_VECELT(__API_ELT_SZ) / 8;
|
|
cnt -= ARM_PIX_VECELT(__API_ELT_SZ);
|
|
}
|
|
#else
|
|
__asm volatile(
|
|
".p2align 2 \n"
|
|
" wlstp."TO_STRING(__API_ELT_SZ) " lr, %[cnt], 1f \n"
|
|
"2: \n"
|
|
/* widened vector load */
|
|
ARM_VLDWID_ASM(8,__API_ELT_SZ) \
|
|
" q0, [%[src], %[offS]] \n"
|
|
|
|
" vand q0, q0, %[bitmask] \n"
|
|
" add %[src], %[src], %[incrSrc] \n"
|
|
|
|
ARM_VLD1_ASM(__API_ELT_SZ) \
|
|
" q1, [%[target]] \n"
|
|
|
|
" vmvn q1, q1 \n"
|
|
|
|
" vcmp.i" TO_STRING(__API_ELT_SZ) \
|
|
" eq, q0, zr \n"
|
|
" vpsel q1, %[vBackG], q1 \n"
|
|
|
|
ARM_VST1_ASM(__API_ELT_SZ) \
|
|
" q1, [%[target]], #16 \n"
|
|
|
|
" letp lr, 2b \n"
|
|
"1: \n"
|
|
|
|
: [src] "+r"(pchSourceBaseCur), [target] "+r" (pTargetBaseCur)
|
|
: [bitmask] "t" (vBitMask), [vBackG] "t" (vBgColor),
|
|
[cnt] "r" (cnt), [incrSrc] "i" (ARM_PIX_VECELT(__API_ELT_SZ) / 8),
|
|
[offS] "t" (offS)
|
|
: "q0", "q1", "memory", "lr");
|
|
#endif
|
|
pchSourceBase += (iSourceStride >> 3);
|
|
pTargetBase += iTargetStride;
|
|
}
|
|
}
|
|
}
|
|
|
|
#else /* __API_ELT_SZ != 32 */
|
|
|
|
/*
 * GENERIC_RGB32_DRAW_PATTERN(SELECTOR)
 *
 * Shared body of the 32-bit draw-pattern kernels.  A source byte carries 8
 * pattern bits, i.e. two 4-lane 32-bit vectors, so every group of 8 pixels
 * is handled as a "Lo" and a "Hi" half.
 *
 * The macro expands inside a function and uses these names from its scope:
 *   pchSourceBase, iSourceStride (bits, multiple of 8), pTargetBase,
 *   iTargetStride (pixels), ptCopySize,
 *   offSLo / offSHi         gather byte offsets for lanes 0..3 / 4..7
 *   vBitMaskLo / vBitMaskHi bit selectors for lanes 0..3 / 4..7
 *
 * SELECTOR(pred) must be an expression producing the new uint32x4_t target
 * value; pred is true on lanes whose pattern bit is set.  SELECTOR may refer
 * to the macro-local vector `vTarg`, which holds the current target pixels.
 *
 * Partial vectors are loaded with the same predicate that is used for the
 * store, so nothing beyond the end of a row is read from the target or
 * fetched for the inactive lanes of the source gather.  The row predicates
 * of the two narrow cases do not depend on the row and are computed once.
 */
#define GENERIC_RGB32_DRAW_PATTERN(SELECTOR) \
    if (ptCopySize->iWidth <= 4) { \
        /* at most 4 pixels per row: only the low halves of the gather */ \
        /* offsets and bitmask are needed, no inner loop */ \
        mve_pred16_t p = vctp32q(ptCopySize->iWidth); \
 \
        for (int32_t y = 0; y < ptCopySize->iHeight; y++) { \
            uint32x4_t vchSrc = \
                vldrbq_gather_offset_z_u32(pchSourceBase, offSLo, p); \
            uint32x4_t vTarg = vld1q_z(pTargetBase, p); \
 \
            vchSrc = vandq(vchSrc, vBitMaskLo); \
            vTarg = SELECTOR(vcmpneq_n_u32(vchSrc, 0)); \
 \
            vst1q_p(pTargetBase, vTarg, p); \
 \
            pchSourceBase += (iSourceStride >> 3); \
            pTargetBase += iTargetStride; \
        } \
    } else if (ptCopySize->iWidth <= 8) { \
        /* 5 to 8 pixels per row: full low half plus partial high half, */ \
        /* no inner loop */ \
        mve_pred16_t p = vctp32q(ptCopySize->iWidth - 4); \
 \
        for (int32_t y = 0; y < ptCopySize->iHeight; y++) { \
            uint32x4_t vchSrc = \
                vldrbq_gather_offset_u32(pchSourceBase, offSLo); \
            uint32x4_t vTarg = vld1q(pTargetBase); \
 \
            vchSrc = vandq(vchSrc, vBitMaskLo); \
            vTarg = SELECTOR(vcmpneq_n_u32(vchSrc, 0)); \
 \
            vst1q(pTargetBase, vTarg); \
 \
            vchSrc = vldrbq_gather_offset_z_u32(pchSourceBase, offSHi, p); \
            vTarg = vld1q_z(pTargetBase + 4, p); \
 \
            vchSrc = vandq(vchSrc, vBitMaskHi); \
            vTarg = SELECTOR(vcmpneq_n_u32(vchSrc, 0)); \
 \
            vst1q_p(pTargetBase + 4, vTarg, p); \
 \
            pchSourceBase += (iSourceStride >> 3); \
            pTargetBase += iTargetStride; \
        } \
    } else { \
        /* generic case: groups of 8 pixels followed by a 1..7 pixel tail */ \
        for (int32_t y = 0; y < ptCopySize->iHeight; y++) { \
            int32_t cnt = ptCopySize->iWidth; \
            uint8_t *pchSourceBaseCur = pchSourceBase; \
            uint32_t *pTargetBaseCur = pTargetBase; \
 \
            while (cnt >= 8) { \
                uint32x4_t vchSrc = \
                    vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); \
                uint32x4_t vTarg = vld1q(pTargetBaseCur); \
 \
                vchSrc = vandq(vchSrc, vBitMaskLo); \
                vTarg = SELECTOR(vcmpneq_n_u32(vchSrc, 0)); \
 \
                vst1q(pTargetBaseCur, vTarg); \
                pTargetBaseCur += 4; \
 \
                vchSrc = vldrbq_gather_offset_u32(pchSourceBaseCur, offSHi); \
                vTarg = vld1q(pTargetBaseCur); \
 \
                vchSrc = vandq(vchSrc, vBitMaskHi); \
                vTarg = SELECTOR(vcmpneq_n_u32(vchSrc, 0)); \
 \
                vst1q(pTargetBaseCur, vTarg); \
                pTargetBaseCur += 4; \
 \
                /* 8 pattern bits consumed */ \
                pchSourceBaseCur += 1; \
                cnt -= 8; \
            } \
 \
            /* tail */ \
            if (cnt > 4) { \
                /* full low half + residual part of the high half */ \
                mve_pred16_t p = vctp32q(cnt - 4); \
                uint32x4_t vchSrc = \
                    vldrbq_gather_offset_u32(pchSourceBaseCur, offSLo); \
                uint32x4_t vTarg = vld1q(pTargetBaseCur); \
 \
                vchSrc = vandq(vchSrc, vBitMaskLo); \
                vTarg = SELECTOR(vcmpneq_n_u32(vchSrc, 0)); \
 \
                vst1q(pTargetBaseCur, vTarg); \
                pTargetBaseCur += 4; \
 \
                vchSrc = \
                    vldrbq_gather_offset_z_u32(pchSourceBaseCur, offSHi, p); \
                vTarg = vld1q_z(pTargetBaseCur, p); \
 \
                vchSrc = vandq(vchSrc, vBitMaskHi); \
                vTarg = SELECTOR(vcmpneq_n_u32(vchSrc, 0)); \
 \
                vst1q_p(pTargetBaseCur, vTarg, p); \
            } else if (cnt > 0) { \
                /* residual part of the low half only */ \
                mve_pred16_t p = vctp32q(cnt); \
                uint32x4_t vchSrc = \
                    vldrbq_gather_offset_z_u32(pchSourceBaseCur, offSLo, p); \
                uint32x4_t vTarg = vld1q_z(pTargetBaseCur, p); \
 \
                vchSrc = vandq(vchSrc, vBitMaskLo); \
                vTarg = SELECTOR(vcmpneq_n_u32(vchSrc, 0)); \
 \
                vst1q_p(pTargetBaseCur, vTarg, p); \
            } \
 \
            pchSourceBase += (iSourceStride >> 3); \
            pTargetBase += iTargetStride; \
        } \
    }
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb32_draw_pattern_fg_only(uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
uint32_t *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
uint32_t hwForeColour)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t offSLo = vld1q(__draw_pattern_src_incr_rgb32 + iOffset);
|
|
uint32x4_t offSHi = vld1q(__draw_pattern_src_incr_rgb32 + iOffset + 4);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t vBitMaskLo = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset);
|
|
uint32x4_t vBitMaskHi = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset + 4);
|
|
|
|
#define SEL_FG_ONLY(p) vdupq_m_n_u32(vTarg, hwForeColour, p);
|
|
|
|
GENERIC_RGB32_DRAW_PATTERN(SEL_FG_ONLY);
|
|
}
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb32_draw_pattern_no_bg_comp(uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
uint32_t *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t offSLo = vld1q(__draw_pattern_src_incr_rgb32 + iOffset);
|
|
uint32x4_t offSHi = vld1q(__draw_pattern_src_incr_rgb32 + iOffset + 4);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t vBitMaskLo = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset);
|
|
uint32x4_t vBitMaskHi = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset + 4);
|
|
|
|
#define SEL_NO_BG_COMP(p) vpselq(~vTarg, vTarg, p);
|
|
|
|
GENERIC_RGB32_DRAW_PATTERN(SEL_NO_BG_COMP);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb32_draw_pattern_bg_only(uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
uint32_t *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
uint32_t hwBackColour)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t offSLo = vld1q(__draw_pattern_src_incr_rgb32 + iOffset);
|
|
uint32x4_t offSHi = vld1q(__draw_pattern_src_incr_rgb32 + iOffset + 4);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t vBitMaskLo = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset);
|
|
uint32x4_t vBitMaskHi = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset + 4);
|
|
|
|
#define SEL_BG_ONLY(p) vdupq_m_n_u32(vTarg, hwBackColour, p);
|
|
|
|
GENERIC_RGB32_DRAW_PATTERN(SEL_BG_ONLY);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb32_draw_pattern_bg_fg(uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
uint32_t *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
uint32_t hwForeColour,
|
|
uint32_t hwBackColour)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t offSLo = vld1q(__draw_pattern_src_incr_rgb32 + iOffset);
|
|
uint32x4_t offSHi = vld1q(__draw_pattern_src_incr_rgb32 + iOffset + 4);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t vBitMaskLo = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset);
|
|
uint32x4_t vBitMaskHi = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset + 4);
|
|
uint32x4_t vFgColor = vdupq_n_u32(hwForeColour);
|
|
uint32x4_t vBgColor = vdupq_n_u32(hwBackColour);
|
|
|
|
#define SEL_BG_FG(p) vpselq(vFgColor, vBgColor, p);
|
|
|
|
GENERIC_RGB32_DRAW_PATTERN(SEL_BG_FG);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__OVERRIDE_WEAK
|
|
void __arm_2d_impl_rgb32_draw_pattern_bg_comp(uint8_t *__RESTRICT pchSourceBase,
|
|
int32_t iOffset,
|
|
int16_t iSourceStride,
|
|
uint32_t *__RESTRICT pTargetBase,
|
|
int16_t iTargetStride,
|
|
arm_2d_size_t *__RESTRICT ptCopySize,
|
|
uint32_t hwBackColour)
|
|
{
|
|
//! get in byte offset
|
|
iOffset &= 0x07;
|
|
iSourceStride = (iSourceStride + 7) & ~0x07;
|
|
|
|
/* deduces offset vector from iOffset for gather loading */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t offSLo = vld1q(__draw_pattern_src_incr_rgb32 + iOffset);
|
|
uint32x4_t offSHi = vld1q(__draw_pattern_src_incr_rgb32 + iOffset + 4);
|
|
/* deduces bitmask vector with wrap from iOffset */
|
|
/* hold 8 contiguous values into 2 32-bit vector pair */
|
|
uint32x4_t vBitMaskLo = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset);
|
|
uint32x4_t vBitMaskHi = vld1q(__draw_pattern_src_bitmask_rgb32 + iOffset + 4);
|
|
uint32x4_t vBgColor = vdupq_n_u32(hwBackColour);
|
|
|
|
#define SEL_BG_COMP(p) vpselq(vmvnq(vTarg), vBgColor, p);
|
|
|
|
GENERIC_RGB32_DRAW_PATTERN(SEL_BG_COMP);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
/* Release the template parameters and the name-mangling helpers
   (presumably so the template can be included again with other settings). */
#undef __API_ELT_SZ
#undef __API_COLOUR
#undef __ARM_2D_FUNC
#undef ___ARM_2D_FUNC
#undef ____ARM_2D_FUNC
|