/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: Arm-2D Library
* Title: arm-2d_helium.c
* Description: Acceleration extensions using Helium.
*
* $Date: 22. Sep 2021
* $Revision: V.0.12.0
*
* Target Processor: Cortex-M cores with Helium
*
* -------------------------------------------------------------------- */
#define __ARM_2D_IMPL__
#include "arm_2d.h"
#include "__arm_2d_impl.h"
#if defined(__ARM_2D_HAS_HELIUM__) && __ARM_2D_HAS_HELIUM__
#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wunknown-warning-option"
# pragma clang diagnostic ignored "-Wreserved-identifier"
# pragma clang diagnostic ignored "-Wincompatible-pointer-types-discards-qualifiers"
# pragma clang diagnostic ignored "-Wcast-qual"
# pragma clang diagnostic ignored "-Wcast-align"
# pragma clang diagnostic ignored "-Wextra-semi-stmt"
# pragma clang diagnostic ignored "-Wsign-conversion"
# pragma clang diagnostic ignored "-Wunused-function"
# pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
# pragma clang diagnostic ignored "-Wdouble-promotion"
# pragma clang diagnostic ignored "-Wunused-parameter"
# pragma clang diagnostic ignored "-Wimplicit-float-conversion"
# pragma clang diagnostic ignored "-Wimplicit-int-conversion"
# pragma clang diagnostic ignored "-Wtautological-pointer-compare"
# pragma clang diagnostic ignored "-Wmissing-prototypes"
# pragma clang diagnostic ignored "-Wsign-compare"
# pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
# pragma clang diagnostic ignored "-Wpadded"
# pragma clang diagnostic ignored "-Wvector-conversion"
# pragma clang diagnostic ignored "-Wundef"
#endif
#include "__arm_2d_paving_helium.h"
#include "__arm_2d_math_helium.h"
#include "__arm_2d_utils_helium.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! \brief initialise the helium acceleration service
*! \param none
*! \return none
*/
void __arm_2d_helium_init(void)
{
/* even if this is empty, do not remove it */
}
/*----------------------------------------------------------------------------*
* Code Template for tile operations *
*----------------------------------------------------------------------------*/
#define __API_COLOUR c8bit
#define __API_INT_TYPE uint8_t
#define __API_INT_TYPE_BIT_NUM 8
#include "__arm_2d_copy_helium.inc"
#define __API_COLOUR rgb16
#define __API_INT_TYPE uint16_t
#define __API_INT_TYPE_BIT_NUM 16
#include "__arm_2d_copy_helium.inc"
#define __API_COLOUR rgb32
#define __API_INT_TYPE uint32_t
#define __API_INT_TYPE_BIT_NUM 32
#include "__arm_2d_copy_helium.inc"
/*----------------------------------------------------------------------------*
* Specialized Copy Routines *
*----------------------------------------------------------------------------*/
static
void __arm_copy_16_mve_narrow( uint16_t *phwSource,
int16_t iSourceStride,
uint16_t *phwTarget,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize)
{
#ifdef USE_MVE_INTRINSICS
for (int32_t x = 0; x < ptCopySize->iWidth; x++) {
uint16x8_t srcStr = vidupq_u16((uint32_t) 0, 1);
uint16x8_t dstStr = vidupq_u16((uint32_t) 0, 1);
srcStr = srcStr * iSourceStride;
dstStr = dstStr * iTargetStride;
for (int32_t y = 0; y < ptCopySize->iHeight / 8; y++) {
uint16x8_t in = vldrhq_gather_shifted_offset_u16(phwSource, srcStr);
srcStr = vaddq_n_u16(srcStr, (8 * iSourceStride));
vstrhq_scatter_shifted_offset_u16(phwTarget, dstStr, in);
dstStr = vaddq_n_u16(dstStr, (8 * iTargetStride));
}
phwSource++;
phwTarget++;
}
#else
__asm volatile(
" clrm {r2, r4} \n"
" vidup.u16 q0, r2, #1 \n"
" vmul.i16 q2, q0, %[iSourceStride] \n"
" vidup.u16 q1, r4, #1 \n"
" vmul.i16 q3, q1, %[iTargetStride] \n"
"3: \n"
/* outer loop, iterates over columns */
/* size = ptCopySize->iWidth */
" vmov q0, q2 \n"
" vmov q1, q3 \n"
/* inner loop, iterates over rows (size = ptCopySize->iHeight) */
" wlstp.16 lr, %[iHeight], 1f \n"
".p2align 2 \n"
"2: \n"
" vldrh.u16 q4, [%[phwSource], q0, uxtw #1] \n"
" vadd.i16 q0, q0, %[iSourceStridex8] \n"
" vstrh.16 q4, [%[phwTarget], q1, uxtw #1] \n"
" vadd.i16 q1, q1, %[iTargetStridex8] \n"
" letp lr, 2b \n"
"1: \n"
" add.n %[phwSource], #2 \n"
" add.n %[phwTarget], #2 \n"
" subs %[iWidth], #1 \n"
" bne 3b \n"
: [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth),
[iSourceStride] "r" (iSourceStride),[iSourceStridex8] "r" (iSourceStride*8),
[iTargetStride] "r" (iTargetStride),[iTargetStridex8] "r" (iTargetStride*8)
: "r2", "r4", "q0", "q1", "q2", "q3", "q4", "memory", "r14", "cc");
#endif
}
static
void __arm_copy_32_mve_narrow( uint32_t *pwSource,
int32_t iSourceStride,
uint32_t *pwTarget,
int32_t iTargetStride,
arm_2d_size_t *ptCopySize)
{
#ifdef USE_MVE_INTRINSICS
for (int_fast32_t x = 0; x < ptCopySize->iWidth; x++) {
uint32x4_t srcStr = vidupq_u32((uint32_t) 0, 1);
uint32x4_t dstStr = vidupq_u32((uint32_t) 0, 1);
srcStr = srcStr * iSourceStride;
dstStr = dstStr * iTargetStride;
for (int_fast32_t y = 0; y < ptCopySize->iHeight / 4; y++) {
uint32x4_t in = vldrwq_gather_shifted_offset_u32(pwSource, srcStr);
srcStr = vaddq_n_u32(srcStr, (4 * iSourceStride));
vstrwq_scatter_shifted_offset_u32(pwTarget, dstStr, in);
dstStr = vaddq_n_u32(dstStr, (4 * iTargetStride));
}
pwSource++;
pwTarget++;
}
#else
__asm volatile(
" clrm {r2, r4} \n"
" vidup.u32 q0, r2, #1 \n"
" vmul.i32 q2, q0, %[iSourceStride] \n"
" vidup.u32 q1, r4, #1 \n"
" vmul.i32 q3, q1, %[iTargetStride] \n"
"3: \n"
/* outer loop, iterates over columns */
/* size = ptCopySize->iWidth */
" vmov q0, q2 \n"
" vmov q1, q3 \n"
/* inner loop, iterates over rows (size = ptCopySize->iHeight) */
" wlstp.32 lr, %[iHeight], 1f \n"
".p2align 2 \n"
"2: \n"
" vldrw.u32 q4, [%[pwSource], q0, uxtw #2] \n"
" vadd.i32 q0, q0, %[iSourceStridex4] \n"
" vstrw.32 q4, [%[pwTarget], q1, uxtw #2] \n"
" vadd.i32 q1, q1, %[iTargetStridex4] \n"
" letp lr, 2b \n"
"1: \n"
" add.n %[pwSource], #4 \n"
" add.n %[pwTarget], #4 \n"
" subs %[iWidth], #1 \n"
" bne 3b \n"
: [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth),
[iSourceStride] "r" (iSourceStride),[iSourceStridex4] "r" (iSourceStride*4),
[iTargetStride] "r" (iTargetStride),[iTargetStridex4] "r" (iTargetStride*4)
: "r2", "r4", "q0", "q1", "q2", "q3", "q4", "memory", "r14", "cc");
#endif
}
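/*
* Illustrative scalar equivalent of the two narrow-copy helpers above (not
* part of the build): for very narrow images the copy walks column by
* column, so each gather / scatter handles one column, 8 rows (16-bit) or
* 4 rows (32-bit) at a time. The sketch below ignores the predicated tail
* handling of the MVE code.
*
*   for (int32_t x = 0; x < ptCopySize->iWidth; x++) {
*       for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
*           phwTarget[y * iTargetStride + x] = phwSource[y * iSourceStride + x];
*       }
*   }
*/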
__OVERRIDE_WEAK
void __arm_2d_impl_rgb16_copy( uint16_t *phwSource,
int16_t iSourceStride,
uint16_t *phwTarget,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize)
{
/*
* 16-bit Narrow copy case:
* use column copy with scatter / gather
*/
if(ptCopySize->iWidth <= 4) {
__arm_copy_16_mve_narrow(phwSource,
iSourceStride,
phwTarget,
iTargetStride,
ptCopySize);
} else if((((uint32_t)phwSource & 3) == 0) && (((uint32_t)phwTarget & 3) == 0)
&& ((iSourceStride & 3) == 0) && ((iTargetStride & 3) ==0)) {
/*
* source / dst & strides are 64-bit aligned
* use scalar LDRD/STRD, faster than back to back vector VLDR/VSTR on M55
*/
__asm volatile(
"3: \n"
" mov r0, %[phwSource] \n"
" mov r1, %[phwTarget] \n"
/* scalar version faster (no DTCM bank conflict)*/
" wls lr, %[iWidth], 1f \n"
".p2align 2 \n"
"2: \n"
" ldrd r2, r3, [r0], #8 \n"
" strd r2, r3, [r1], #8 \n"
" le lr, 2b \n"
"1: \n"
// tail
" wls lr, %[iWidthTail], 1f \n"
".p2align 2 \n"
"2: \n"
" ldrh r2, [r0], #2 \n"
" strh r2, [r1], #2 \n"
" le lr, 2b \n"
"1: \n"
" add %[phwSource], %[iSourceStride] \n"
" add %[phwTarget], %[iTargetStride] \n"
" subs %[iHeight], #1 \n"
" bne 3b \n"
: [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth/4),
[iWidthTail] "r" (ptCopySize->iWidth & 3),
[iSourceStride] "r" (iSourceStride*sizeof(uint16_t)),
[iTargetStride] "r" (iTargetStride*sizeof(uint16_t))
: "r0", "r1", "r2", "r3", "q0", "memory", "r14", "cc"
);
}
else
{
/*
* generic column major 16-bit 2D copy
*/
int32_t iWidth = ptCopySize->iWidth;
int32_t iHeight = ptCopySize->iHeight;
__asm volatile(
" mov r2, %[iHeight] \n"
"3: \n"
" mov r0, %[phwSource] \n"
" mov r1, %[phwTarget] \n"
" wlstp.16 lr, %[iWidth], 1f \n"
".p2align 2 \n"
"2: \n"
" vldrh.u16 q0, [r0], #16 \n"
" vstrh.16 q0, [r1], #16 \n"
" letp lr, 2b \n"
"1: \n"
" add %[phwSource], %[iSourceStride] \n"
" add %[phwTarget], %[iTargetStride] \n"
" subs r2, #1 \n"
" bne 3b \n"
: [phwTarget] "+r"(phwTarget), [phwSource] "+r"(phwSource)
: [iHeight] "r"(iHeight), [iWidth] "r" (iWidth),
[iSourceStride] "r" (iSourceStride*sizeof(uint16_t)),
[iTargetStride] "r" (iTargetStride*sizeof(uint16_t))
: "r0", "r1", "r2", "q0", "memory", "r14", "cc");
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_rgb32_copy( uint32_t *pwSource,
int16_t iSourceStride,
uint32_t *pwTarget,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize)
{
if(ptCopySize->iWidth <= 2) {
/*
* 32-bit Narrow copy case:
* use column copy with scatter / gather
*/
__arm_copy_32_mve_narrow(pwSource,
iSourceStride,
pwTarget,
iTargetStride,
ptCopySize);
} else if((((uint32_t)pwSource & 3) == 0) && (((uint32_t)pwTarget & 3) == 0)
&& ((iSourceStride & 3) == 0) && ((iTargetStride & 3) ==0)) {
/*
* source / dst & strides are 64-bit aligned
* use scalar LDRD/STRD, faster than back to back vector VLDR/VSTR on M55
*/
__asm volatile(
"3: \n"
" mov r0, %[pwSource] \n"
" mov r1, %[pwTarget] \n"
/* scalar version faster (no DTCM bank conflict)*/
" wls lr, %[iWidth], 1f \n"
".p2align 2 \n"
"2: \n"
" ldrd r2, r3, [r0], #8 \n"
" strd r2, r3, [r1], #8 \n"
" le lr, 2b \n"
"1: \n"
// tail
" wls lr, %[iWidthTail], 1f \n"
".p2align 2 \n"
"2: \n"
" ldr r2, [r0], #4 \n"
" str r2, [r1], #4 \n"
" le lr, 2b \n"
"1: \n"
" add %[pwSource], %[iSourceStride] \n"
" add %[pwTarget], %[iTargetStride] \n"
" subs %[iHeight], #1 \n"
" bne 3b \n"
: [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth/2),
[iWidthTail] "r" (ptCopySize->iWidth & 1),
[iSourceStride] "r" (iSourceStride*sizeof(uint32_t)),
[iTargetStride] "r" (iTargetStride*sizeof(uint32_t))
: "r0", "r1", "r2", "r3", "q0", "memory", "r14", "cc"
);
}
else
{
/*
* generic column major 32-bit 2D copy
*/
__asm volatile(
"3: \n"
" mov r0, %[pwSource] \n"
" mov r1, %[pwTarget] \n"
" wlstp.32 lr, %[iWidth], 1f \n"
".p2align 2 \n"
"2: \n"
" vldrw.32 q0, [r0], #16 \n"
" vstrw.32 q0, [r1], #16 \n"
" letp lr, 2b \n"
"1: \n"
" add %[pwSource], %[iSourceStride] \n"
" add %[pwTarget], %[iTargetStride] \n"
" subs %[iHeight], #1 \n"
" bne 3b \n"
: [pwTarget] "+r"(pwTarget), [pwSource] "+r"(pwSource)
: [iHeight] "r"(ptCopySize->iHeight), [iWidth] "r" (ptCopySize->iWidth),
[iSourceStride] "r" (iSourceStride*sizeof(uint32_t)),
[iTargetStride] "r" (iTargetStride*sizeof(uint32_t))
: "r0", "r1", "q0", "memory", "r14", "cc");
}
}
/*----------------------------------------------------------------------------*
* alpha blending *
*----------------------------------------------------------------------------*/
__OVERRIDE_WEAK
void __arm_2d_impl_gray8_alpha_blending(uint8_t * __RESTRICT pSourceBase,
int16_t iSourceStride,
uint8_t * __RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint_fast8_t chRatio)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16_t hwRatioCompl = 256 - chRatio;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pSource = pSourceBase;
uint8_t *pTarget = pTargetBase;
int32_t blkCnt = iWidth;
#ifdef USE_MVE_INTRINSICS
do {
mve_pred16_t tailPred = vctp16q(blkCnt);
uint16x8_t vecTgt = vldrbq_z_u16(pTarget, tailPred);
uint16x8_t vecSrc = vldrbq_z_u16(pSource, tailPred);
vecTgt = vmulq_x(vecTgt, hwRatioCompl, tailPred);
vecTgt = vmlaq_m(vecTgt, vecSrc, chRatio, tailPred);
vecTgt = vecTgt >> 8;
vstrbq_p_u16(pTarget , vecTgt , tailPred);
pSource += 8;
pTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
__asm volatile(
" vldrb.u16 q0, [%[pTarget]] \n"
".p2align 2 \n"
" wls lr, %[loopCnt], 1f \n"
"2: \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.u16 q0, q1, %[chRatio] \n"
" vldrb.u16 q2, [%[pTarget], #8] \n"
" vshr.u16 q0, q0, #8 \n"
" vstrb.u16 q0, [%[pTarget]], #8 \n"
" vmul.u16 q2, q2, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.u16 q2, q1, %[chRatio] \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vstrb.u16 q2, [%[pTarget]], #8 \n"
" le lr, 2b \n"
"1: \n"
/* tail */
" wlstp.16 lr, %[tail], 1f \n"
"2: \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.u16 q0, q1, %[chRatio] \n"
" vshr.u16 q1, q0, #8 \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vstrb.u16 q1, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget), [pSource] "+r" (pSource)
: [chRatio] "r" (chRatio), [hwRatioCompl] "r" (hwRatioCompl),
[loopCnt] "r"(blkCnt/16), [tail] "r"(blkCnt & 0xf)
:"q0", "q1", "q2", "memory", "r14");
#endif
pSourceBase += (iSourceStride);
pTargetBase += (iTargetStride);
}
}
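/*
* For reference, the per-pixel operation implemented above is a plain 8-bit
* alpha blend with an 8.8 fixed-point ratio (simplified scalar sketch; the
* assembly path merely unrolls and software-pipelines it):
*
*   uint16_t hwRatioCompl = 256 - chRatio;
*   *pTarget = (uint8_t)((*pTarget * hwRatioCompl + *pSource * chRatio) >> 8);
*/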
__OVERRIDE_WEAK
void __arm_2d_impl_gray8_alpha_blending_colour_keying(uint8_t * __RESTRICT pSourceBase,
int16_t iSourceStride,
uint8_t * __RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint_fast8_t chRatio,
uint8_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16_t hwRatioCompl = 256 - chRatio;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pSource = pSourceBase;
uint8_t *pTarget = pTargetBase;
int32_t blkCnt = iWidth;
#ifdef USE_MVE_INTRINSICS
do {
mve_pred16_t tailPred = vctp16q(blkCnt);
uint16x8_t vecTgt = vldrbq_z_u16(pTarget, tailPred);
uint16x8_t vecSrc = vldrbq_z_u16(pSource, tailPred);
vecTgt = vmulq_x(vecTgt, hwRatioCompl, tailPred);
vecTgt = vmlaq_m(vecTgt, vecSrc, chRatio, tailPred);
vecTgt = vecTgt >> 8;
vstrbq_p_u16(pTarget , vecTgt ,
vcmpneq_m_n_u16(vecSrc, (uint16_t)Colour, tailPred));
pSource += 8;
pTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
__asm volatile(
" vldrb.u16 q0, [%[pTarget]] \n"
".p2align 2 \n"
" wls lr, %[loopCnt], 1f \n"
"2: \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.u16 q0, q1, %[chRatio] \n"
" vldrb.u16 q2, [%[pTarget], #8] \n"
" vshr.u16 q0, q0, #8 \n"
" vpt.u16 ne, q1, %[Colour] \n"
" vstrbt.u16 q0, [%[pTarget]], #8 \n"
" vmul.u16 q2, q2, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.u16 q2, q1, %[chRatio] \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vpt.u16 ne, q1, %[Colour] \n"
" vstrbt.u16 q2, [%[pTarget]], #8 \n"
" le lr, 2b \n"
"1: \n"
/* tail */
" wlstp.16 lr, %[tail], 1f \n"
"2: \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vldrb.u16 q1, [%[pSource]], #8 \n"
" vmla.u16 q0, q1, %[chRatio] \n"
" vshr.u16 q2, q0, #8 \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vpt.u16 ne, q1, %[Colour] \n"
" vstrbt.u16 q2, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget), [pSource] "+r" (pSource)
: [chRatio] "r" (chRatio), [hwRatioCompl] "r" (hwRatioCompl),
[loopCnt] "r"(blkCnt/16), [tail] "r"(blkCnt & 0xf),
[Colour] "r" (Colour)
:"q0", "q1", "q2", "memory", "r14");
#endif
pSourceBase += (iSourceStride);
pTargetBase += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_gray8_colour_filling_with_opacity(uint8_t * __restrict pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *
__restrict ptCopySize,
uint8_t Colour, uint_fast8_t chRatio)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16_t hwRatioCompl = 256 - chRatio;
uint16x8_t vecSrc = vdupq_n_u16(Colour);
for (int_fast16_t y = 0; y < iHeight; y++) {
uint8_t *pTarget = pTargetBase;
int32_t blkCnt = iWidth;
#ifdef USE_MVE_INTRINSICS
do {
mve_pred16_t tailPred = vctp16q(blkCnt);
uint16x8_t vecTgt = vldrbq_z_u16(pTarget, tailPred);
vecTgt = vmulq_x(vecTgt, hwRatioCompl, tailPred);
vecTgt = vmlaq_m(vecTgt, vecSrc, chRatio, tailPred);
vecTgt = vecTgt >> 8;
vstrbq_p_u16(pTarget , vecTgt , tailPred);
pTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
__asm volatile(
" vldrb.u16 q0, [%[pTarget]] \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
".p2align 2 \n"
" wls lr, %[loopCnt], 1f \n"
"2: \n"
" vmla.u16 q0, %[vecSrc], %[chRatio] \n"
" vldrb.u16 q2, [%[pTarget], #8] \n"
" vshr.u16 q0, q0, #8 \n"
" vmul.u16 q2, q2, %[hwRatioCompl] \n"
" vstrb.u16 q0, [%[pTarget]], #8 \n"
" vmla.u16 q2, %[vecSrc], %[chRatio] \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vstrb.u16 q2, [%[pTarget]], #8 \n"
" le lr, 2b \n"
"1: \n"
/* tail */
" wlstp.16 lr, %[tail], 1f \n"
"2: \n"
" vmla.u16 q0, %[vecSrc], %[chRatio] \n"
" vshr.u16 q2, q0, #8 \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vmul.u16 q0, q0, %[hwRatioCompl] \n"
" vstrb.u16 q2, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget)
: [chRatio] "r" (chRatio), [hwRatioCompl] "r" (hwRatioCompl),
[loopCnt] "r"(blkCnt/16), [tail] "r"(blkCnt & 0xf),
[vecSrc] "t" (vecSrc)
:"q0", "q2", "memory", "r14");
#endif
pTargetBase += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_alpha_blending( uint16_t *phwSourceBase,
int16_t iSourceStride,
uint16_t *phwTargetBase,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize,
uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
uint16_t ratio1x8 = chRatio * 8;
uint16_t ratio1x4 = chRatio * 4;
uint16_t ratio2x8 = (256 - chRatio) * 8;
uint16_t ratio2x4 = (256 - chRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
const uint16_t *phwSource = phwSourceBase;
uint16_t *phwTarget = phwTargetBase;
blkCnt = ptCopySize->iWidth;
do {
uint16x8_t vecIn;
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack 1st stream */
vecIn = vld1q(phwSource);
vecR0 = vecIn & vecMaskR;
vecB0 = vecIn >> 11;
vecG0 = vecIn >> 5;
vecG0 = vecG0 & vecMaskG;
/* unpack 2nd stream */
vecIn = vld1q(phwTarget);
vecR1 = vecIn & vecMaskR;
vecB1 = vecIn >> 11;
vecG1 = vecIn >> 5;
vecG1 = vecG1 & vecMaskG;
/* merge */
vecR0 = vecR0 * ratio1x8 + vecR1 * ratio2x8;
vecR0 = vecR0 >> 8;
vecG0 = vecG0 * ratio1x4 + vecG1 * ratio2x4;
vecG0 = vecG0 >> 8;
vecB0 = vecB0 * ratio1x8 + vecB1 * ratio2x8;
vecB0 = vecB0 >> 8;
/* pack */
uint16x8_t vOut = vecR0 >> 3 | vmulq((vecG0 & vecMaskGpck), 8)
| vmulq((vecB0 & vecMaskBpck), 256);
vst1q(phwTarget, vOut);
phwSource += 8;
phwTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
phwSourceBase += iSourceStride;
phwTargetBase += iTargetStride;
}
#else /* USE_MVE_INTRINSICS */
uint16_t ratio1x8 = chRatio * 8;
uint16_t ratio1x4 = chRatio * 4;
uint16_t ratio2x8 = (256 - chRatio) * 8;
uint16_t ratio2x4 = (256 - chRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint32_t iWidth = ptCopySize->iWidth;
int32_t row = ptCopySize->iHeight;
uint16x8_t scratch[1];
vst1q((uint16_t *)scratch, vdupq_n_u16(0x00fc));
do {
const uint16_t *pSource = phwSourceBase;
uint16_t *pTarget = phwTargetBase;
register unsigned loopCnt __asm("lr");
loopCnt = iWidth;
__asm volatile(
".p2align 2 \n"
" vldrh.u16 q4, [%[pTarget]] \n"
" vldrh.u16 q5, [%[pSource]], #16 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
// B target extraction
// right shift by 5 (x 1/32) for M55 friendly
// IV / Mul pipe interleaving
" vqdmulh.s16 q2, q4, %[rshft5] \n"
" vand q7, q4, %[vecMaskR] \n"
" vmul.i16 q6, q7, %[ratio2x8] \n"
// B source extraction
" vand q7, q5, %[vecMaskR] \n"
// B mix
" vmla.u16 q6, q7, %[ratio1x8] \n"
// G extraction
" vand q2, q2, %[vecMaskG] \n"
" vshr.u16 q7, q5, #5 \n"
" vmul.i16 q2, q2, %[ratio2x4] \n"
// G extraction
" vand q7, q7, %[vecMaskG] \n"
// G mix
" vmla.u16 q2, q7, %[ratio1x4] \n"
// R extraction
" vshr.u16 q4, q4, #11 \n"
" vmul.i16 q7, q4, %[ratio2x8] \n"
// R extraction
" vshr.u16 q5, q5, #11 \n"
// R mix
" vmla.u16 q7, q5, %[ratio1x8] \n"
" vshr.u16 q2, q2, #8 \n"
" vldrh.16 q5, [%[scratch]] \n"
" vand q2, q2, q5 \n"
// vmulq((vecG0 & 0x00fc), 8)
" vmul.i16 q2, q2, %[eight] \n"
" vshr.u16 q4, q7, #8 \n"
// schedule next source load
" vldrh.u16 q5, [%[pSource]], #16 \n"
" vand q7, q4, %[vecMaskBpck] \n"
// pack R & G
// vmulq((vecG0 & vecMaskGpck), 8) + vmulq((vecB0 & vecMaskBpck), 256)
" vmla.u16 q2, q7, %[twofiftysix] \n"
// downshift B ((vecB0 >> 8) >> 3)
" vshr.u16 q7, q6, #11 \n"
// schedule next target load (pre-offset as the target pointer is not incremented yet)
" vldrh.u16 q4, [%[pTarget], #16] \n"
// pack blue with R&G
" vorr q2, q2, q7 \n"
" vstrh.16 q2, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [pSource] "+r"(pSource), [pTarget] "+r" (pTarget), [loopCnt] "+r"(loopCnt)
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
[vecMaskBpck] "t" (vecMaskBpck),
[ratio1x8] "r" (ratio1x8), [ratio2x8] "r" (ratio2x8),
[ratio1x4] "r" (ratio1x4), [ratio2x4] "r" (ratio2x4),
[eight] "r" (8), [twofiftysix] "r" (256), [rshft5] "r" (1024), [scratch] "r" (scratch)
: "q2", "q4", "q5", "q6", "q7", "memory" );
phwSourceBase += iSourceStride;
phwTargetBase += iTargetStride;
} while (--row);
#endif /* USE_MVE_INTRINSICS */
}
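/*
* Note on the x8 / x4 ratio scaling used above (illustrative scalar sketch,
* not part of the build): the two 5-bit RGB565 channels are blended with
* chRatio * 8 and the 6-bit channel with chRatio * 4, so a single ">> 8"
* removes the 8.8 fixed-point ratio while leaving each channel positioned
* for the final mask-and-shift repacking:
*
*   lo5  = (srcLo5  * chRatio * 8 + dstLo5  * (256 - chRatio) * 8) >> 8;
*   mid6 = (srcMid6 * chRatio * 4 + dstMid6 * (256 - chRatio) * 4) >> 8;
*   hi5  = (srcHi5  * chRatio * 8 + dstHi5  * (256 - chRatio) * 8) >> 8;
*   out  = (lo5 >> 3) | ((mid6 & 0x00fc) << 3) | ((hi5 & 0x00f8) << 8);
*/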
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_colour_filling_with_opacity(
uint16_t *__RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *__RESTRICT ptCopySize,
uint16_t Colour,
uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
uint16_t ratio1x8 = chRatio * 8;
uint16_t ratio1x4 = chRatio * 4;
uint16_t ratio2x8 = (256 - chRatio) * 8;
uint16_t ratio2x4 = (256 - chRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
uint16x8_t vecIn;
uint16x8_t vecColorR, vecColorB, vecColorG;
/* unpack color & scale */
vecIn = vdupq_n_u16(Colour);
vecColorR = (vecIn & vecMaskR) * ratio1x8;
vecColorB = (vecIn >> 11) * ratio1x8;
vecColorG = vecIn >> 5;
vecColorG = (vecColorG & vecMaskG) * ratio1x4;
for (int32_t y = 0; y < ptCopySize->iHeight; y++) {
uint16_t *phwTarget = pTargetBase;
blkCnt = ptCopySize->iWidth;
do {
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack stream */
vecIn = vld1q(phwTarget);
vecR1 = vecIn & vecMaskR;
vecB1 = vecIn >> 11;
vecG1 = vecIn >> 5;
vecG1 = vecG1 & vecMaskG;
/* merge */
vecR0 = vecColorR + vecR1 * ratio2x8;
vecR0 = vecR0 >> 8;
vecG0 = vecColorG + vecG1 * ratio2x4;
vecG0 = vecG0 >> 8;
vecB0 = vecColorB + vecB1 * ratio2x8;
vecB0 = vecB0 >> 8;
/* pack */
uint16x8_t vOut = vecR0 >> 3 | vmulq((vecG0 & vecMaskGpck), 8)
| vmulq((vecB0 & vecMaskBpck), 256);
vst1q(phwTarget, vOut);
phwTarget += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
pTargetBase += iTargetStride;
}
#else /* USE_MVE_INTRINSICS */
uint16_t ratio1x8 = chRatio * 8;
uint16_t ratio1x4 = chRatio * 4;
uint16_t ratio2x8 = (256 - chRatio) * 8;
uint16_t ratio2x4 = (256 - chRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecColorR, vecColorB, vecColorG;
uint16x8_t scratch[4];
/* unpack color */
uint16x8_t vecIn = vdupq_n_u16(Colour);
vecColorR = vecIn & vecMaskR;
vecColorB = vecIn >> 11;
vecColorG = vecIn >> 5;
vecColorG = vecColorG & vecMaskG;
vst1q((uint16_t*)scratch, vecColorR * ratio1x8);
vst1q((uint16_t*)&scratch[1], vecColorB * ratio1x8);
vst1q((uint16_t*)&scratch[2], vecColorG * ratio1x4);
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0x00fc));
int32_t row = ptCopySize->iHeight;
do {
uint16_t *phwTarget = pTargetBase;
register unsigned loopCnt __asm("lr");
loopCnt = ptCopySize->iWidth;
__asm volatile(
" vldrh.u16 q4, [%[phwTarget]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
".p2align 2 \n"
"2: \n"
// B target extraction
" vand q7, q4, %[vecMaskR] \n"
" vldrh.u16 q6, [%[scratch]] \n"
" vshr.u16 q2, q4, #5 \n"
// B mix
" vmla.u16 q6, q7, %[ratio2x8] \n"
// G extraction
" vand q7, q2, %[vecMaskG] \n"
// G extraction
" vldrh.u16 q2, [%[scratch], #32] \n"
// G mix
" vmla.u16 q2, q7, %[ratio2x4] \n"
" vshr.u16 q4, q4, #11 \n"
// R extraction
" vldrh.u16 q7, [%[scratch], #16] \n"
" vshr.u16 q2, q2, #8 \n"
// R mix
" vmla.u16 q7, q4, %[ratio2x8] \n"
" vshr.u16 q4, q7, #8 \n"
// load duplicated 0xfc mask
" vldrh.u16 q7, [%[scratch], #48] \n"
" vand q2, q2, q7 \n"
" vmul.i16 q2, q2, %[eight] \n"
" vand q7, q4, %[vecMaskBpck] \n"
// pack R & G
" vmla.u16 q2, q7, %[twofiftysix] \n"
// downshift B ((vecB0 >> 8) >> 3)
" vshr.u16 q7, q6, #11 \n"
// schedule next target load
" vldrh.u16 q4, [%[phwTarget], #16] \n"
// pack blue with R&G
" vorr q2, q2, q7 \n"
" vstrh.16 q2, [%[phwTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [phwTarget] "+r" (phwTarget), [loopCnt] "+r"(loopCnt)
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
[vecMaskBpck] "t" (vecMaskBpck),
[ratio2x8] "r" (ratio2x8), [ratio2x4] "r" (ratio2x4),
[eight] "r" (8), [twofiftysix] "r" (256), [scratch] "r" (scratch)
: "q2", "q4", "q5", "q6", "q7", "memory" );
pTargetBase += iTargetStride;
} while (--row);
#endif /* USE_MVE_INTRINSICS */
}
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_alpha_blending_colour_keying(
uint16_t * __RESTRICT phwSource,
int16_t iSourceStride,
uint16_t * __RESTRICT phwTarget,
int16_t iTargetStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint_fast8_t chRatio,
uint32_t hwColour)
{
#ifdef USE_MVE_INTRINSICS
uint32_t iHeight = ptCopySize->iHeight;
uint32_t iWidth = ptCopySize->iWidth;
int32_t blkCnt;
uint16_t ratio1x8 = chRatio * 8;
uint16_t ratio1x4 = chRatio * 4;
uint16_t ratio2x8 = (256 - chRatio) * 8;
uint16_t ratio2x4 = (256 - chRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
for (uint32_t y = 0; y < iHeight; y++) {
// - unconditional blending + predicated destination update
const uint16_t *pSource = phwSource;
uint16_t *pTarget = phwTarget;
blkCnt = iWidth >> 3;
while (blkCnt > 0) {
uint16x8_t vecInSrc, vecInDst;
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack 1st stream */
vecInSrc = vld1q(pSource);
vecR0 = vandq(vecInSrc, vecMaskR);
vecB0 = vshrq(vecInSrc, 11);
vecG0 = vshrq(vecInSrc, 5);
vecG0 = vandq(vecG0, vecMaskG);
/* unpack 2nd stream */
vecInDst = vld1q(pTarget);
vecR1 = vandq(vecInDst, vecMaskR);
vecB1 = vshrq(vecInDst, 11);
vecG1 = vshrq(vecInDst, 5);
vecG1 = vandq(vecG1, vecMaskG);
/* merge */
vecR0 = vmlaq(vmulq(vecR0, ratio1x8), vecR1, ratio2x8);
vecR0 = vshrq(vecR0, 8);
vecG0 = vmlaq(vmulq(vecG0, ratio1x4), vecG1, ratio2x4);
vecG0 = vshrq(vecG0, 8);
vecB0 = vmlaq(vmulq(vecB0, ratio1x8), vecB1, ratio2x8);
vecB0 = vshrq(vecB0, 8);
/* pack */
uint16x8_t vOut = vorrq(vshrq(vecR0, 3),
vmulq(vandq(vecG0, vecMaskGpck), 8));
vOut = vorrq(vOut, vmulq(vandq(vecB0, vecMaskBpck), 256));
vst1q_p(pTarget, vOut, vcmpneq_n_s16(vecInSrc, hwColour));
pSource += 8;
pTarget += 8;
blkCnt--;
}
blkCnt = iWidth & 7;
if (blkCnt > 0U) {
uint16x8_t vecInSrc, vecInDst;
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack 1st stream */
vecInSrc = vld1q(pSource);
vecR0 = vandq(vecInSrc, vecMaskR);
vecB0 = vshrq(vecInSrc, 11);
vecG0 = vshrq(vecInSrc, 5);
vecG0 = vandq(vecG0, vecMaskG);
/* unpack 2nd stream */
vecInDst = vld1q(pTarget);
vecR1 = vandq(vecInDst, vecMaskR);
vecB1 = vshrq(vecInDst, 11);
vecG1 = vshrq(vecInDst, 5);
vecG1 = vandq(vecG1, vecMaskG);
/* merge */
vecR0 = vmlaq(vmulq(vecR0, ratio1x8), vecR1, ratio2x8);
vecR0 = vshrq(vecR0, 8);
vecG0 = vmlaq(vmulq(vecG0, ratio1x4), vecG1, ratio2x4);
vecG0 = vshrq(vecG0, 8);
vecB0 = vmlaq(vmulq(vecB0, ratio1x8), vecB1, ratio2x8);
vecB0 = vshrq(vecB0, 8);
/* pack */
uint16x8_t vOut = vorrq(vshrq(vecR0, 3),
vmulq(vandq(vecG0, vecMaskGpck), 8));
vOut = vorrq(vOut,
vmulq(vandq(vecB0, vecMaskBpck), 256));
vst1q_p(pTarget, vOut,
vcmpneq_m_n_s16(vecInSrc, hwColour, vctp16q(blkCnt)));
}
phwSource += iSourceStride;
phwTarget += iTargetStride;
}
#else
uint32_t iHeight = ptCopySize->iHeight;
uint32_t iWidth = ptCopySize->iWidth;
uint16_t ratio1x8 = chRatio * 8;
uint16_t ratio1x4 = chRatio * 4;
uint16_t ratio2x8 = (256 - chRatio) * 8;
uint16_t ratio2x4 = (256 - chRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t scratch[1];
vst1q((uint16_t *)scratch, vdupq_n_u16(0x00fc));
for (uint32_t y = 0; y < iHeight; y++) {
const uint16_t *pSource = phwSource;
uint16_t *pTarget = phwTarget;
register unsigned loopCnt __asm("lr");
loopCnt = iWidth;
__asm volatile(
".p2align 2 \n"
" vldrh.u16 q4, [%[pTarget]] \n"
" vldrh.u16 q5, [%[pSource]], #16 \n"
" vand q7, q4, %[vecMaskR] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
// B target extraction
" vshr.u16 q2, q4, #5 \n"
" vmul.i16 q6, q7, %[ratio2x8] \n"
// B source extraction
" vand q7, q5, %[vecMaskR] \n"
// B mix
" vmla.u16 q6, q7, %[ratio1x8] \n"
// G extraction
" vand q2, q2, %[vecMaskG] \n"
" vshr.u16 q7, q5, #5 \n"
" vmul.i16 q2, q2, %[ratio2x4] \n"
// G extraction
" vand q7, q7, %[vecMaskG] \n"
// G mix
" vmla.u16 q2, q7, %[ratio1x4] \n"
// R extraction
" vshr.u16 q4, q4, #11 \n"
" vmul.i16 q7, q4, %[ratio2x8] \n"
// R extraction
" vshr.u16 q5, q5, #11 \n"
// R mix
" vmla.u16 q7, q5, %[ratio1x8] \n"
" vshr.u16 q2, q2, #8 \n"
" vldrh.16 q5, [%[scratch]] \n"
" vand q2, q2, q5 \n"
// vmulq((vecG0 & 0x00fc), 8)
" vmul.i16 q2, q2, %[eight] \n"
" vshr.u16 q4, q7, #8 \n"
// schedule next source load
" vldrh.u16 q5, [%[pSource]], #16 \n"
" vand q7, q4, %[vecMaskBpck] \n"
// pack R & G
// vmulq((vecG0 & vecMaskGpck), 8) + vmulq((vecB0 & vecMaskBpck), 256)
" vmla.u16 q2, q7, %[twofiftysix] \n"
// downshift B ((vecB0 >> 8) >> 3)
" vshr.u16 q7, q6, #11 \n"
// schedule next target load (pre-offset as the target pointer is not incremented yet)
" vldrh.u16 q4, [%[pTarget], #16] \n"
// pack blue with R&G
" vorr q2, q2, q7 \n"
" vldrh.u16 q6, [%[pSource], #-32] \n"
" vand q7, q4, %[vecMaskR] \n"
" vpt.u16 ne, q6, %[hwColour] \n"
" vstrht.16 q2, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [pSource] "+r"(pSource), [pTarget] "+r" (pTarget), [loopCnt] "+r"(loopCnt)
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
[vecMaskBpck] "t" (vecMaskBpck),
[ratio1x8] "r" (ratio1x8), [ratio2x8] "r" (ratio2x8),
[ratio1x4] "r" (ratio1x4), [ratio2x4] "r" (ratio2x4),
[eight] "r" (8), [twofiftysix] "r" (256), [hwColour] "r" (hwColour), [scratch] "r" (scratch)
: "q2", "q4", "q5", "q6", "q7", "memory" );
phwSource += (iSourceStride);
phwTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __arm_2d_impl_cccn888_alpha_blending( uint32_t *pwSourceBase,
int16_t iSourceStride,
uint32_t *pwTargetBase,
int16_t iTargetStride,
arm_2d_size_t *ptCopySize,
uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
int32_t blkCnt;
int32_t row = ptCopySize->iHeight;
while (row > 0) {
const uint32_t *pwSource = pwSourceBase;
uint32_t *pwTarget = pwTargetBase;
/* byte extraction into 16-bit vector */
uint16x8_t vecSrc = vldrbq_u16((const uint8_t *)pwSource);
uint16x8_t vecTrg = vldrbq_u16((const uint8_t *)pwTarget);
pwSource += 2;
blkCnt = ptCopySize->iWidth;
while (blkCnt > 0) {
vstrbq_u16((const uint8_t *)pwTarget,
vmlaq(vmulq(vecSrc, chRatio), vecTrg, chRatioCompl) >> 8);
pwTarget += 2;
vecSrc = vldrbq_u16((const uint8_t *)pwSource);
vecTrg = vldrbq_u16((const uint8_t *)pwTarget);
pwSource += 2;
blkCnt -= 2;
}
pwSourceBase += iSourceStride;
pwTargetBase += iTargetStride;
row--;
}
#else
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
register unsigned blkCnt __asm("lr");
int32_t row = ptCopySize->iHeight;
while(row > 0)
{
blkCnt = ptCopySize->iWidth*4;
const uint32_t *pwSource = pwSourceBase;
uint32_t *pwTarget = pwTargetBase;
__asm volatile(
" vldrb.u16 q0, [%[pwSource]], #8 \n"
" vldrb.u16 q1, [%[pwTarget]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vmul.u16 q2, q0, %[chRatio] \n"
" vldrb.u16 q0, [%[pwSource]], #8 \n"
" vmla.u16 q2, q1, %[chRatioCompl] \n"
" vldrb.u16 q1, [%[pwTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vstrb.16 q2, [%[pwTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pwSource] "+l"(pwSource), [pwTarget] "+l"(pwTarget),
[loopCnt] "+r"(blkCnt)
: [chRatio] "r" (chRatio), [chRatioCompl] "r" (chRatioCompl)
: "q0", "q1", "q2", "memory" );
pwSourceBase += iSourceStride;
pwTargetBase += iTargetStride;
row--;
}
#endif
}
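/*
* Per-pixel reference for the CCCN888 blend above (simplified scalar
* sketch, not part of the build): each of the 4 bytes of a pixel is
* blended independently; the vector path gets there by widening the bytes
* into 16-bit lanes before the multiply / accumulate:
*
*   const uint8_t *pchSrc = (const uint8_t *)pwSource;
*   uint8_t *pchDst = (uint8_t *)pwTarget;
*   for (int n = 0; n < 4; n++) {
*       pchDst[n] = (uint8_t)((pchSrc[n] * chRatio
*                            + pchDst[n] * (256 - chRatio)) >> 8);
*   }
*/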
__OVERRIDE_WEAK
void __arm_2d_impl_cccn888_colour_filling_with_opacity(
uint32_t *__RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *__RESTRICT ptCopySize,
uint32_t Colour,
uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
int32_t blkCnt;
int32_t row = ptCopySize->iHeight;
uint32_t scratch[2];
uint16x8_t vColor;
scratch[0] = scratch[1] = Colour;
vColor = vldrbq_u16((uint8_t *) scratch);
vColor = vColor * (uint16_t)chRatio;
while (row > 0) {
uint32_t *pTarget = pTargetBase;
blkCnt = ptCopySize->iWidth;
while (blkCnt > 0) {
/* byte extraction into 16-bit vector */
uint16x8_t vecTrg = vldrbq_u16((uint8_t *)pTarget);
vstrbq_u16((uint8_t *)pTarget, vmlaq(vColor, vecTrg, chRatioCompl) >> 8);
pTarget += 2;
blkCnt -= 2;
}
pTargetBase += iTargetStride;
row--;
}
#else /* USE_MVE_INTRINSICS */
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
int32_t blkCnt;
int32_t row = ptCopySize->iHeight;
uint32_t scratch[2];
uint16x8_t vColor;
scratch[0] = scratch[1] = Colour;
vColor = vldrbq_u16((uint8_t *) scratch);
vColor = vColor * (uint16_t)chRatio;
while (row > 0) {
uint32_t *pTarget = pTargetBase;
blkCnt = ptCopySize->iWidth*4;
__asm volatile(
/* preload */
" vldrb.u16 q1, [%[pTarget]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
".p2align 2 \n"
"2: \n"
" vmov q2, %[vColor] \n"
" vmla.u16 q2, q1, %[chRatioCompl] \n"
" vldrb.u16 q1, [%[pTarget], #8] \n"
" vshr.u16 q2, q2, #8 \n"
" vstrb.16 q2, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+l"(pTarget)
: [loopCnt] "r"(blkCnt), [chRatioCompl] "r" (chRatioCompl), [vColor] "t" (vColor)
: "q0", "q1", "q2", "memory" );
pTargetBase += iTargetStride;
row--;
}
#endif /* USE_MVE_INTRINSICS */
}
__OVERRIDE_WEAK
void __arm_2d_impl_cccn888_alpha_blending_colour_keying(uint32_t * __RESTRICT pSourceBase,
int16_t iSourceStride,
uint32_t * __RESTRICT pTargetBase,
int16_t iTargetStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint_fast8_t chRatio,
uint32_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16_t chRatioCompl = 256 - chRatio;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint32_t *pSource = pSourceBase;
uint32_t *pTarget = pTargetBase;
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt = iWidth;
do {
mve_pred16_t p = vctp32q(blkCnt);
uint8x16_t vSrc8 = vld1q_z(pSource, p);
uint8x16_t vTrg8 = vld1q_z(pTarget, p);
/* 16-bit expansion A/G src pixels */
uint16x8_t vSrc16b = vmovlbq_x(vSrc8, p);
/* 16-bit expansion R/B src pixels */
uint16x8_t vSrc16t = vmovltq_x(vSrc8, p);
/* 16-bit expansion A/G target pixels */
uint16x8_t vTrg16b = vmovlbq_x(vTrg8, p);
/* 16-bit expansion R/B target pixels */
uint16x8_t vTrg16t = vmovltq_x(vTrg8, p);
/* A/G blending */
int16x8_t vecOutb = vmlaq_m(vmulq_x(vSrc16b, chRatio, p), vTrg16b, chRatioCompl, p);
/* R/B blending */
int16x8_t vecOutt = vmlaq_m(vmulq_x(vSrc16t, chRatio, p), vTrg16t, chRatioCompl, p);
/* merge into 8-bit vector */
int8x16_t vecOut8 = vuninitializedq_s8();
vecOut8 = vqshrnbq_m_n_s16(vecOut8, vecOutb, 8, p);
vecOut8 = vqshrntq_m_n_s16(vecOut8, vecOutt, 8, p);
// update if (*pSourceBase != Colour)
vst1q_p_u32(pTarget, (uint32x4_t) vecOut8,
vcmpneq_m_n_u32((uint32x4_t) vSrc8, Colour, p));
pSource += 4;
pTarget += 4;
blkCnt -= 4;
}
while (blkCnt > 0);
#else // USE_MVE_INTRINSICS
__asm volatile (
".p2align 2 \n"
/* preload uint32x4_t target vector */
" vldrw.u32 q2, [%[targ]] \n"
" wlstp.32 lr, %[loopCnt], 1f \n"
"2: \n"
/* 16-bit expansion A/G target pixels */
" vmovlb.u8 q3, q2 \n"
" vldrw.u32 q0, [%[src]], #16 \n"
/* 16-bit expansion A/G source pixels */
" vmovlb.u8 q1, q0 \n"
" vmul.i16 q1, q1, %[ratio] \n"
/* 16-bit expansion R/B target pixels */
" vmovlt.u8 q2, q2 \n"
/* A/G blending */
" vmla.u16 q1, q3, %[ratioCmp] \n"
/* 16-bit expansion R/B source pixels */
" vmovlt.u8 q3, q0 \n"
" vmul.i16 q3, q3, %[ratio] \n"
/* merge A/G into 8-bit vector */
" vqshrnb.s16 q1, q1, #8 \n"
/* R/B blending */
" vmla.u16 q3, q2, %[ratioCmp] \n"
/* preload next target */
" vldrw.u32 q2, [%[targ], #16] \n"
/* merge R/B into 8-bit vector */
" vqshrnt.s16 q1, q3, #8 \n"
/* update if (*pSourceBase != Colour) */
" vpt.i32 ne, q0, %[color] \n"
" vstrwt.32 q1, [%[targ]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[targ] "+r" (pTarget), [src] "+r" (pSource)
:[loopCnt] "r" (iWidth), [ratio] "r" (chRatio),
[ratioCmp] "r" (chRatioCompl), [color] "r" (Colour)
:"r14", "q0", "q1", "q2", "q3", "memory");
#endif
pSourceBase += (iSourceStride);
pTargetBase += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_alpha_blending_direct(const uint16_t *phwSource,
const uint16_t *phwBackground,
uint16_t *phwDestination,
uint32_t wPixelCount,
uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
uint16_t ratio1x8 = chRatio * 8;
uint16_t ratio1x4 = chRatio * 4;
uint16_t ratio2x8 = (256 - chRatio) * 8;
uint16_t ratio2x4 = (256 - chRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
uint16x8_t vecMaskGpck = vdupq_n_u16(0x00fc);
blkCnt = wPixelCount;
do {
uint16x8_t vecIn;
uint16x8_t vecR0, vecB0, vecG0;
uint16x8_t vecR1, vecB1, vecG1;
/* unpack 1st stream */
vecIn = vld1q(phwSource);
phwSource += 8;
vecR0 = vecIn & vecMaskR;
vecB0 = vecIn >> 11;
vecG0 = vecIn >> 5;
vecG0 = vecG0 & vecMaskG;
/* unpack 2nd stream */
vecIn = vld1q(phwBackground);
phwBackground += 8;
vecR1 = vecIn & vecMaskR;
vecB1 = vecIn >> 11;
vecG1 = vecIn >> 5;
vecG1 = vecG1 & vecMaskG;
/* merge */
vecR0 = vecR0 * ratio1x8 + vecR1 * ratio2x8;
vecR0 = vecR0 >> 8;
vecG0 = vecG0 * ratio1x4 + vecG1 * ratio2x4;
vecG0 = vecG0 >> 8;
vecB0 = vecB0 * ratio1x8 + vecB1 * ratio2x8;
vecB0 = vecB0 >> 8;
/* pack */
uint16x8_t vOut =
vecR0 >> 3 | vmulq((vecG0 & vecMaskGpck), 8)
| vmulq((vecB0 & vecMaskBpck), 256);
vst1q(phwDestination, vOut);
phwDestination += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else /* USE_MVE_INTRINSICS */
uint16_t ratio1x8 = chRatio * 8;
uint16_t ratio1x4 = chRatio * 4;
uint16_t ratio2x8 = (256 - chRatio) * 8;
uint16_t ratio2x4 = (256 - chRatio) * 4;
uint16x8_t vecMaskR = vdupq_n_u16(0x001f);
uint16x8_t vecMaskG = vdupq_n_u16(0x003f);
uint16x8_t vecMaskBpck = vdupq_n_u16(0x00f8);
register unsigned loopCnt __asm("lr") = (wPixelCount);
__asm volatile(
" vldrh.u16 q4, [%[in2]], #16 \n"
" vmov.i16 q6, #0x00fc \n"
" vstrw.32 q6, [sp] \n"
" vldrh.u16 q5, [%[in1]], #16 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vand q6, q4, %[vecMaskR] \n"
" vmul.i16 q6, q6, %[ratio2x8] \n"
" vshr.u16 q2, q4, #5 \n"
" vand q7, q5, %[vecMaskR] \n"
" vmla.u16 q6, q7, %[ratio1x8] \n"
" vand q2, q2, %[vecMaskG] \n"
" vshr.u16 q7, q5, #5 \n"
" vmul.i16 q2, q2, %[ratio2x4] \n"
" vand q7, q7, %[vecMaskG] \n"
" vmla.u16 q2, q7, %[ratio1x4] \n"
" vshr.u16 q4, q4, #11 \n"
" vmul.i16 q7, q4, %[ratio2x8] \n"
" vshr.u16 q5, q5, #11 \n"
" vshr.u16 q2, q2, #8 \n"
" vmla.u16 q7, q5, %[ratio1x8] \n"
// " vmov.i16 q6, #0x00fc \n"
" vshr.u16 q7, q7, #8 \n"
// " vmov.i16 q6, #0x00fc \n"
/* load 0x00fc instead of mov for better overlap opportunity */
" vldrw.u32 q4, [sp] \n"
" vand q2, q2, q4 \n"
" vmul.i16 q2, q2, %[eight] \n"
" vand q4, q7, %[vecMaskBpck] \n" // Q7 = vecB0
" vldrh.u16 q5, [%[in1]], #16 \n"
" vmla.u16 q2, q4, %[twofiftysix] \n"
// (vecR0 >> 3) >> 8
" vshr.u16 q6, q6, #11 \n"
" vldrh.u16 q4, [%[in2]], #16 \n"
" vorr q2, q2, q6 \n"
" vstrh.16 q2, [%[out]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [in1] "+r"(phwSource), [in2] "+r"(phwBackground),
[out] "+r" (phwDestination), [loopCnt] "+r"(loopCnt)
: [vecMaskR] "t" (vecMaskR), [vecMaskG] "t" (vecMaskG),
[vecMaskBpck] "t" (vecMaskBpck),
[ratio1x8] "r" (ratio1x8), [ratio2x8] "r" (ratio2x8),
[ratio1x4] "r" (ratio1x4), [ratio2x4] "r" (ratio2x4),
[eight] "r" (8), [twofiftysix] "r" (256)
: "q2", "q4", "q5", "q6", "q7", "memory" );
#endif
}
__OVERRIDE_WEAK
void __arm_2d_impl_cccn888_alpha_blending_direct(const uint32_t *pwSource,
const uint32_t *pwBackground,
uint32_t *pwDestination,
uint32_t wPixelCount,
uint_fast8_t chRatio)
{
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt;
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
uint16x8_t vecSrc, vecBckg;
vecSrc = vldrbq_u16((uint8_t const *) pwSource);
pwSource += 2;
vecBckg = vldrbq_u16((uint8_t const *) pwBackground);
pwBackground += 2;
blkCnt = wPixelCount;
do {
uint16x8_t vecOut;
vecOut = vmulq_n_u16(vecSrc, (uint16_t) chRatio);
vecSrc = vldrbq_u16((uint8_t const *) pwSource);
pwSource += 2;
vecOut = vmlaq_n_u16(vecOut, vecBckg, chRatioCompl);
vecBckg = vldrbq_u16((uint8_t const *) pwBackground);
pwBackground += 2;
vecOut = vecOut >> 8;
vstrbq_u16((uint8_t *) pwDestination, vecOut);
pwDestination += 2;
blkCnt -= 2;
}
while (blkCnt > 0);
#else /* USE_MVE_INTRINSICS */
uint16_t chRatioCompl = 256 - (uint16_t) chRatio;
register unsigned blkCnt __asm("lr") = (wPixelCount * 4);
__asm volatile(
" vldrb.u16 q0, [%[pwSource]], #8 \n"
" vldrb.u16 q1, [%[pwBackg]], #8 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vmul.u16 q2, q0, %[chRatio] \n"
" vldrb.u16 q0, [%[pwSource]], #8 \n"
" vmla.u16 q2, q1, %[chRatioCompl] \n"
" vldrb.u16 q1, [%[pwBackg]], #8 \n"
" vshr.u16 q2, q2, #8 \n"
" vstrb.16 q2, [%[pwDest]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pwSource] "+l"(pwSource), [pwBackg] "+l"(pwBackground),
[pwDest] "+l" (pwDestination), [loopCnt] "+r"(blkCnt)
: [chRatio] "r" (chRatio), [chRatioCompl] "r" (chRatioCompl)
: "q0", "q1", "q2", "memory" );
#endif
}
static
mve_pred16_t arm_2d_is_point_vec_inside_region_s16(const arm_2d_region_t * ptRegion,
const arm_2d_point_s16x8_t * ptPoint)
{
mve_pred16_t p0 = vcmpgeq(ptPoint->X, ptRegion->tLocation.iX);
p0 = vcmpgeq_m(ptPoint->Y, ptRegion->tLocation.iY, p0);
p0 = vcmpltq_m(ptPoint->X, ptRegion->tLocation.iX + ptRegion->tSize.iWidth, p0);
p0 = vcmpltq_m(ptPoint->Y, ptRegion->tLocation.iY + ptRegion->tSize.iHeight, p0);
return p0;
}
static
mve_pred16_t arm_2d_is_point_vec_inside_region_s32(const arm_2d_region_t * ptRegion,
const arm_2d_point_s32x4_t * ptPoint)
{
mve_pred16_t p0 = vcmpgeq_n_s32(ptPoint->X, ptRegion->tLocation.iX);
p0 = vcmpgeq_m_n_s32(ptPoint->Y, ptRegion->tLocation.iY, p0);
p0 = vcmpltq_m_n_s32(ptPoint->X, ptRegion->tLocation.iX + ptRegion->tSize.iWidth, p0);
p0 = vcmpltq_m_n_s32(ptPoint->Y, ptRegion->tLocation.iY + ptRegion->tSize.iHeight, p0);
return p0;
}
/**
@brief return 3 vectors of 16-bit channels (widened from 8-bit) taken from a memory reference
@param[in] pMem pointer to packed 8-bit channel
@param[out] R vector of 16-bit widened R channel
@param[out] G vector of 16-bit widened G channel
@param[out] B vector of 16-bit widened B channel
*/
void __arm_2d_unpack_rgb888_from_mem(const uint8_t * pMem, uint16x8_t * R, uint16x8_t * G,
uint16x8_t * B)
{
uint16x8_t sg = vidupq_n_u16(0, 4);
*R = vldrbq_gather_offset_u16(pMem, sg);
*G = vldrbq_gather_offset_u16(pMem + 1, sg);
*B = vldrbq_gather_offset_u16(pMem + 2, sg);
}
/**
@brief interleave 3 x 16-bit widened vectors into an 8-bit packed memory reference
(4th channel untouched)
@param[in] pMem pointer to packed 8-bit channel
@param[in] R vector of 16-bit widened R channel
@param[in] G vector of 16-bit widened G channel
@param[in] B vector of 16-bit widened B channel
*/
void __arm_2d_pack_rgb888_to_mem(uint8_t * pMem, uint16x8_t R, uint16x8_t G, uint16x8_t B)
{
uint16x8_t sg = vidupq_n_u16(0, 4);
vstrbq_scatter_offset_u16(pMem, sg, R);
vstrbq_scatter_offset_u16(pMem + 1, sg, G);
vstrbq_scatter_offset_u16(pMem + 2, sg, B);
//vstrbq_scatter_offset_u16(pMem + 3, sg, vdupq_n_u16(0));
}
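/*
* Typical round trip for 8 packed pixels using the two helpers above
* (sketch only; pwPixels is a hypothetical pointer to 8 CCCN888 pixels):
* unpack the three colour channels into 16-bit lanes, process them, then
* scatter them back, leaving the 4th byte of every pixel untouched.
*
*   uint16x8_t R, G, B;
*   __arm_2d_unpack_rgb888_from_mem((const uint8_t *)pwPixels, &R, &G, &B);
*   R = R >> 1; G = G >> 1; B = B >> 1;      // e.g. halve the brightness
*   __arm_2d_pack_rgb888_to_mem((uint8_t *)pwPixels, R, G, B);
*/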
/**
Unpack a vector of 8-bit pixels (widened to 16-bit) gathered, with offset compensation, from input
2D coordinates when these fit inside the region of interest and the pixel value differs from the
colour mask; otherwise the corresponding target pixel is kept.
(The vector of packed target pixels and the region of interest are implicit and fixed to
vTarget and ptOrigValidRegion respectively.)
Updates the global predicate tracking region fit & colour-mask comparison.
*/
#define __ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vecX, vecY, ptVal8, MaskColour, pGlb) \
arm_2d_point_s16x8_t vPoint = {.X = vecX,.Y = vecY }; \
/* set vector predicate if point is inside the region */ \
mve_pred16_t p = \
arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint); \
pGlb |= p; \
/* prepare vector of point offsets */ \
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1; \
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride; \
\
/* base pointer update to compensate offset */ \
uint8_t *pOriginCorrected = pOrigin + (correctionOffset * iOrigStride); \
/* retrieve all point values */ \
ptVal8 = \
vldrbq_gather_offset_z_u16(pOriginCorrected, ptOffs, predTail); \
\
/* combine 2 predicates set to true if point is in the region & values */ \
/* different from color mask */ \
p = vcmpneq_m_n_u16(ptVal8, MaskColour, p); \
pGlb |= p; \
ptVal8 = vpselq_u16(ptVal8, vTarget, p);
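/*
* Note on the offset compensation above (scalar view of the arithmetic):
* gather-load offsets are limited to 16 bits, so the smallest Y of the
* vector (obtained with vminvq) minus one is folded into the base pointer
* and subtracted from the per-lane offsets, which keeps
* X + (Y - correctionOffset) * iOrigStride small enough for a uint16_t
* offset even when Y * iOrigStride itself would overflow:
*
*   int16_t correctionOffset = minY - 1;
*   const uint8_t *pBase = pOrigin + correctionOffset * iOrigStride;
*   uint16_t offs = (uint16_t)(X + (Y - correctionOffset) * iOrigStride);
*   uint8_t pixel = pBase[offs];
*/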
/**
Unpack a vector of RGB565 pixels gathered from input 2D coordinates when these fit inside the
region of interest and the pixel value differs from the colour mask; otherwise the corresponding
target pixel is kept.
(The vector of packed target pixels and the region of interest are implicit and fixed to
vTarget and ptOrigValidRegion respectively.)
Updates the global predicate tracking region fit & colour-mask comparison.
*/
#define __ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vecX, vecY, R, G, B, MaskColour, pGlb) \
arm_2d_point_s16x8_t vPoint = {.X = vecX,.Y = vecY }; \
/* set vector predicate if point is inside the region */ \
mve_pred16_t p = \
arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint); \
pGlb |= p; \
\
/* prepare vector of point offsets */ \
uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride; \
/* retrieve all point values */ \
uint16x8_t ptVal = \
vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail); \
\
/* combine 2 predicates set to true if point is in the region & values different from color mask */\
p = vcmpneq_m_n_u16(ptVal, MaskColour, p); \
pGlb |= p; \
ptVal = vpselq_u16(ptVal, vTarget, p); \
\
/* expand channels */ \
__arm_2d_rgb565_unpack_single_vec(ptVal, &R, &G, &B);
/**
Same as above but uses offset compensation during the gather load.
Unpack a vector of RGB565 pixels gathered from input 2D coordinates when these fit inside the
region of interest and the pixel value differs from the colour mask; otherwise the corresponding
target pixel is kept.
(The vector of packed target pixels and the region of interest are implicit and fixed to
vTarget and ptOrigValidRegion respectively.)
Updates the global predicate tracking region fit & colour-mask comparison.
*/
#define __ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vecX, vecY, R, G, B, MaskColour, pGlb) \
arm_2d_point_s16x8_t vPoint = {.X = vecX,.Y = vecY }; \
/* set vector predicate if point is inside the region */ \
mve_pred16_t p = \
arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint); \
pGlb |= p; \
/* prepare vector of point offsets */ \
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1; \
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride; \
\
/* base pointer update to compensate offset */ \
uint16_t *pOriginCorrected = pOrigin + (correctionOffset * iOrigStride); \
/* retrieve all point values */ \
uint16x8_t ptVal = \
vldrhq_gather_shifted_offset_z_u16(pOriginCorrected, ptOffs, predTail); \
\
/* combine 2 predicates set to true if point is in the region & values different from color mask */\
p = vcmpneq_m_n_u16(ptVal, MaskColour, p); \
pGlb |= p; \
ptVal = vpselq_u16(ptVal, vTarget, p); \
\
/* expand channels */ \
__arm_2d_rgb565_unpack_single_vec(ptVal, &R, &G, &B);
/**
Unpack a vector of 32-bit pixels gathered from input 2D coordinates when these fit inside the
region of interest and the pixel value differs from the colour mask; otherwise the corresponding
target pixel is kept.
The 16-bit point vector is processed in 2 halves because of the 32-bit gather requirements,
i.e. as two 4 x 32-bit vectors covering 8 pixels.
(The vectors of packed target pixels and the region of interest are implicit and fixed to
vTargetLo, vTargetHi and ptOrigValidRegion respectively.)
Updates 2 global predicates tracking region fit & colour-mask comparison for the 1st and 2nd half.
*/
#define __ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vecX, vecY, R, G, B, MaskColour, pGlbLo, pGlbHi) \
arm_2d_point_s16x8_t vPoint = {.X = vecX,.Y = vecY }; \
arm_2d_point_s32x4_t tPointLo, tPointHi; \
\
/* split 16-bit point vector into 2 x 32-bit vectors */ \
vst1q(pscratch16, vPoint.X); \
tPointLo.X = vldrhq_s32(pscratch16); \
tPointHi.X = vldrhq_s32(pscratch16 + 4); \
\
vst1q(pscratch16, vPoint.Y); \
tPointLo.Y = vldrhq_s32(pscratch16); \
tPointHi.Y = vldrhq_s32(pscratch16 + 4); \
\
/* 1st half */ \
\
/* set vector predicate if point is inside the region */ \
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo); \
/* prepare vector of point offsets */ \
uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride; \
\
/* retrieve all point values */ \
uint32x4_t ptVal = vldrwq_gather_shifted_offset_z_u32(pOrigin, ptOffs, predTailLow); \
\
/* combine 2 predicates set to true if point is in the region & values different from color mask */\
p = vcmpneq_m_n_u32(ptVal, MaskColour, p); \
pGlbLo |= p; \
ptVal = vpselq_u32(ptVal, vTargetLo, p); \
\
vst1q(scratch32, ptVal); \
\
/* 2nd half */ \
\
/* set vector predicate if point is inside the region */ \
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi); \
/* prepare vector of point offsets */ \
ptOffs = tPointHi.X + tPointHi.Y * iOrigStride; \
\
/* retrieve all point values */ \
ptVal = vldrwq_gather_shifted_offset_z_u32(pOrigin, ptOffs, predTailHigh); \
\
/* combine 2 predicates set to true if point is in the region & values different from color mask */\
p = vcmpneq_m_n_u32(ptVal, MaskColour, p); \
pGlbHi |= p; \
ptVal = vpselq_u32(ptVal, vTargetHi, p); \
\
vst1q(scratch32 + 4, ptVal); \
\
/* expand channels */ \
__arm_2d_unpack_rgb888_from_mem((uint8_t *) scratch32, &R, &G, &B);
/**
Alpha blending of a packed vector with 3 vectors of single R, G & B channels
*/
#define __ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vPackedTarget, vAvgR, vAvgG, vAvgB, vBlended) \
uint16x8_t vTargetR, vTargetG, vTargetB; \
\
__arm_2d_rgb565_unpack_single_vec(vTarget, &vTargetR, &vTargetG, &vTargetB); \
\
uint16_t chOpacityCompl = (256 - chOpacity); \
\
/* merge */ \
vAvgR = vAvgR * chOpacityCompl + vTargetR * chOpacity; \
vAvgR = vAvgR >> 8; \
\
vAvgG = vAvgG * chOpacityCompl + vTargetG * chOpacity; \
vAvgG = vAvgG >> 8; \
\
vAvgB = vAvgB * chOpacityCompl + vTargetB * chOpacity; \
vAvgB = vAvgB >> 8; \
\
vBlended = __arm_2d_rgb565_pack_single_vec(vAvgR, vAvgG, vAvgB);
#if __ARM_2D_HAS_HELIUM_FLOAT__ \
&& !__ARM_2D_CFG_FORCED_FIXED_POINT_ROTATION__
#define __CALIB 0.009f16
/**
Scale Gray8 channel
*/
#define __ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vScal) \
vAvgPixel = vScal * vcvtq_f16_s16(ptVal8);
/**
Scale Gray8 channel with accumulation
*/
#define __ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vScal) \
vAvgPixel += vScal * vcvtq_f16_s16(ptVal8);
/**
Scale R, G & B channels
*/
#define __ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vScal) \
vAvgPixelR = vScal * vcvtq_f16_s16(R); \
vAvgPixelG = vScal * vcvtq_f16_s16(G); \
vAvgPixelB = vScal * vcvtq_f16_s16(B);
/**
Scale R, G & B channels with accumulation
*/
#define __ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vScal) \
vAvgPixelR += vScal * vcvtq_f16_s16(R); \
vAvgPixelG += vScal * vcvtq_f16_s16(G); \
vAvgPixelB += vScal * vcvtq_f16_s16(B);
static
bool __arm_2d_rotate_regression(arm_2d_size_t * __RESTRICT ptCopySize,
arm_2d_location_t * pSrcPoint,
float fAngle,
arm_2d_location_t * tOffset,
arm_2d_location_t * center,
int32_t iOrigStride,
arm_2d_rot_linear_regr_t regrCoefs[]
)
{
int32_t iHeight = ptCopySize->iHeight;
int32_t iWidth = ptCopySize->iWidth;
float invHeight = iHeight > 1 ? 1.0f / (float) (iHeight - 1) : __LARGEINVF32;
arm_2d_point_s32x4_t vPointCornerI;
int32x4_t vCornerX = { 0, 1, 0, 1 };
int32x4_t vCornerY = { 0, 0, 1, 1 };
float cosAngle = arm_cos_f32(fAngle);
float sinAngle = arm_sin_f32(fAngle);
arm_2d_point_float_t centerf;
float slopeX, slopeY;
bool gatherLoadIdxOverflow = 0;
//printf("invHeight %f\n",invHeight);
centerf.fX = (float) center->iX;
centerf.fY = (float) center->iY;
vPointCornerI.X = vdupq_n_s32(pSrcPoint->iX + tOffset->iX);
vPointCornerI.X = vPointCornerI.X + vmulq_n_s32(vCornerX, (iWidth - 1));
vPointCornerI.Y = vdupq_n_s32(pSrcPoint->iY + tOffset->iY);
vPointCornerI.Y = vPointCornerI.Y + vmulq_n_s32(vCornerY, (iHeight - 1));
/*
Vector version of:
int16_t iX = ptLocation->iX - ptCenter->iX;
int16_t iY = ptLocation->iY - ptCenter->iY;
float cosAngle = arm_cos_f32(fAngle);
float sinAngle = arm_sin_f32(fAngle);
ptOutBuffer->fY = (iY * cosAngle + iX * sinAngle + ptCenter->iY);
ptOutBuffer->fX = (-iY * sinAngle + iX * cosAngle + ptCenter->iX);
*/
arm_2d_point_f32x4_t vTmp, vPointCornerF;
vTmp.X = vsubq_n_f32(vcvtq_f32_s32(vPointCornerI.X), centerf.fX);
vTmp.Y = vsubq_n_f32(vcvtq_f32_s32(vPointCornerI.Y), centerf.fY);
vPointCornerF.X = vmulq_n_f32(vTmp.X, cosAngle) - vmulq_n_f32(vTmp.Y, sinAngle);
vPointCornerF.X = vaddq_n_f32(vPointCornerF.X, centerf.fX);
vPointCornerF.Y = vmulq_n_f32(vTmp.X, sinAngle) + vmulq_n_f32(vTmp.Y, cosAngle);
vPointCornerF.Y = vaddq_n_f32(vPointCornerF.Y, centerf.fY);
/*
Check whether the rotated index offsets could exceed the 16-bit limit
used in the subsequent gather loads.
This can occur for parts of large images (e.g. 320*200).
To avoid an unconditional penalty for small/medium images,
a speculative overflow flag is returned so that large offsets can be handled separately.
*/
float32_t maxY = vmaxnmvq(0.0f, vPointCornerF.Y);
if((iOrigStride * maxY) > (float)(UINT16_MAX))
gatherLoadIdxOverflow = true;
/* interpolation in Y direction for 1st elements column */
slopeX = (vPointCornerF.X[2] - vPointCornerF.X[0]) * invHeight;
slopeY = (vPointCornerF.Y[2] - vPointCornerF.Y[0]) * invHeight;
regrCoefs[0].slopeY = slopeY;
regrCoefs[0].slopeX = slopeX;
regrCoefs[0].interceptY = vPointCornerF.Y[0];
regrCoefs[0].interceptX = vPointCornerF.X[0];
/* interpolation in Y direction for the last elements column */
slopeX = (vPointCornerF.X[3] - vPointCornerF.X[1]) * invHeight;
slopeY = (vPointCornerF.Y[3] - vPointCornerF.Y[1]) * invHeight;
regrCoefs[1].slopeY = slopeY;
regrCoefs[1].slopeX = slopeX;
regrCoefs[1].interceptY = vPointCornerF.Y[1];
regrCoefs[1].interceptX = vPointCornerF.X[1];
return gatherLoadIdxOverflow;
}
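/*
* The two coefficient sets describe the rotated source coordinates of the
* first and last pixel column as linear functions of the destination row.
* A hypothetical consumer (sketch only; the actual per-row stepping lives
* in the rotation kernels) could recover the start point and per-pixel
* steps of row y as:
*
*   float colFirstX = regrCoefs[0].interceptX + y * regrCoefs[0].slopeX;
*   float colFirstY = regrCoefs[0].interceptY + y * regrCoefs[0].slopeY;
*   float colLastX  = regrCoefs[1].interceptX + y * regrCoefs[1].slopeX;
*   float colLastY  = regrCoefs[1].interceptY + y * regrCoefs[1].slopeY;
*   float stepX = (colLastX - colFirstX) / (float)(iWidth - 1);
*   float stepY = (colLastY - colFirstY) / (float)(iWidth - 1);
*/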
static
void __arm_2d_impl_gray8_get_pixel_colour(arm_2d_point_f16x8_t
* ptPoint,
arm_2d_region_t *
ptOrigValidRegion,
uint8_t * pOrigin,
int16_t iOrigStride,
uint8_t * pTarget,
uint8_t MaskColour, uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vldrbq_u16(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
float16x8_t vOne = vdupq_n_f16(1.0f);
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
float16x8_t vAreaTR = vWX * vWY;
float16x8_t vAreaTL = (vOne - vWX) * vWY;
float16x8_t vAreaBR = vWX * (vOne - vWY);
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
/* accumulated pixel vectors */
float16x8_t vAvgPixel;
/* predicate accumulator */
/* tracks all predication conditions for selecting the final */
/* averaged pixel / target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t ptVal8;
/* Bottom Left averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1),
ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTR);
}
/* pack */
uint16x8_t TempPixel = vcvtq_s16_f16(vAvgPixel);
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
#else
arm_2d_point_s16x8_t vPoint = {
.X = vcvtq_s16_f16(ptPoint->X),
.Y = vcvtq_s16_f16(ptPoint->Y)
};
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
/* correctionOffset avoids 16-bit overflow */
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
/* base pointer update to compensate offset */
pOrigin += (correctionOffset * iOrigStride);
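/*
The gather-load offsets are only 16-bit, so the smallest Y of the vector
(minus 1) is folded into the base pointer and removed from the offsets.
For instance, with iOrigStride = 640 and Y around 120, the raw offsets
(~76800) would overflow 16 bits, while the corrected offsets stay small
(~640 + X).
*/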
/* retrieve all point values */
uint16x8_t ptVal = vldrbq_gather_offset_z_u16(pOrigin, ptOffs, predTail);
/* combine 2 predicates set to true if point is in the region & values different from color mask */
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vstrbq_p_u16(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_gray8_get_pixel_colour_with_alpha(arm_2d_point_f16x8_t * ptPoint,
arm_2d_region_t * ptOrigValidRegion,
uint8_t * pOrigin,
int16_t iOrigStride,
uint8_t * pTarget,
uint8_t MaskColour,
uint8_t chOpacity, uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vldrbq_u16(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
float16x8_t vOne = vdupq_n_f16(1.0f);
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
float16x8_t vAreaTR = vWX * vWY;
float16x8_t vAreaTL = (vOne - vWX) * vWY;
float16x8_t vAreaBR = vWX * (vOne - vWY);
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
/* accumulated pixel vectors */
float16x8_t vAvgPixel;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t ptVal8;
/* Bottom Left averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi,
ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1),
ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTR);
}
/* blending */
uint16x8_t vAvg = vcvtq_s16_f16(vAvgPixel);
uint16_t chTransparency = 256 - (chOpacity);
uint16x8_t vBlended = (vAvg * chTransparency + vTarget * chOpacity) >> 8;
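/* the blend above is a plain 8-bit fixed-point linear interpolation, per lane:
   out = (avg * (256 - chOpacity) + target * chOpacity) >> 8 */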
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
#else
arm_2d_point_s16x8_t vPoint = {
.X = vcvtq_s16_f16(ptPoint->X),
.Y = vcvtq_s16_f16(ptPoint->Y)
};
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
/* correctionOffset avoids 16-bit overflow */
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
/* base pointer update to compensate offset */
pOrigin += (correctionOffset * iOrigStride);
/* retrieve all point values */
uint16x8_t ptVal = vldrbq_gather_offset_z_u16(pOrigin, ptOffs, predTail);
/* alpha blending */
uint16_t chTransparency = 256 - (chOpacity);
uint16x8_t vBlended = (ptVal * chTransparency + vTarget * chOpacity) >> 8;
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vstrbq_p_u16(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_rgb565_get_pixel_colour(arm_2d_point_f16x8_t * ptPoint,
arm_2d_region_t * ptOrigValidRegion,
uint16_t * pOrigin,
int16_t iOrigStride,
uint16_t * pTarget,
uint16_t MaskColour, uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vld1q(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
float16x8_t vOne = vdupq_n_f16(1.0f);
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
float16x8_t vAreaTR = vWX * vWY;
float16x8_t vAreaTL = (vOne - vWX) * vWY;
float16x8_t vAreaBR = vWX * (vOne - vWY);
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
/* accumulated pixel vectors */
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* pack */
uint16x8_t TempPixel = __arm_2d_rgb565_pack_single_vec(vcvtq_s16_f16(vAvgPixelR),
vcvtq_s16_f16(vAvgPixelG),
vcvtq_s16_f16(vAvgPixelB));
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
#else
arm_2d_point_s16x8_t vPoint = {
.X = vcvtq_s16_f16(ptPoint->X),
.Y = vcvtq_s16_f16(ptPoint->Y) };
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride;
/* retrieve all point values */
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
/* combine 2 predicates set to true if point is in the region & values different from color mask */
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
/* update target pixels */
vst1q_p(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_rgb565_get_pixel_colour_offs_compensated(arm_2d_point_f16x8_t *
ptPoint,
arm_2d_region_t *
ptOrigValidRegion,
uint16_t * pOrigin,
int16_t iOrigStride,
uint16_t * pTarget,
uint16_t MaskColour,
uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vld1q(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
float16x8_t vOne = vdupq_n_f16(1.0f);
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
float16x8_t vAreaTR = vWX * vWY;
float16x8_t vAreaTL = (vOne - vWX) * vWY;
float16x8_t vAreaBR = vWX * (vOne - vWY);
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
/* accumulated pixel vectors */
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi,
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* pack */
uint16x8_t TempPixel = __arm_2d_rgb565_pack_single_vec(vcvtq_s16_f16(vAvgPixelR),
vcvtq_s16_f16(vAvgPixelG),
vcvtq_s16_f16(vAvgPixelB));
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
#else
arm_2d_point_s16x8_t vPoint = {
.X = vcvtq_s16_f16(ptPoint->X),
.Y = vcvtq_s16_f16(ptPoint->Y) };
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
/* correctionOffset avoids 16-bit overflow */
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
/* base pointer update to compensate offset */
pOrigin += (correctionOffset * iOrigStride);
/* retrieve all point values */
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
/* combine 2 predicates set to true if point is in the region & values different from color mask */
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vst1q_p(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha(
arm_2d_point_f16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint16_t *pOrigin,
int16_t iOrigStride,
uint16_t *pTarget,
uint16_t MaskColour,
uint8_t chOpacity,
uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vld1q(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
float16x8_t vOne = vdupq_n_f16(1.0f);
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
float16x8_t vAreaTR = vWX * vWY;
float16x8_t vAreaTL = (vOne - vWX) * vWY;
float16x8_t vAreaBR = vWX * (vOne - vWY);
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
/* accumulated pixel vectors */
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* blending */
uint16x8_t vAvgR, vAvgG, vAvgB;
vAvgR = vcvtq_s16_f16(vAvgPixelR);
vAvgG = vcvtq_s16_f16(vAvgPixelG);
vAvgB = vcvtq_s16_f16(vAvgPixelB);
uint16x8_t vBlended;
__ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vTarget, vAvgR, vAvgG, vAvgB, vBlended);
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
#else
/* set vector predicate if point is inside the region */
arm_2d_point_s16x8_t vPoint = {
.X = vcvtq_s16_f16(ptPoint->X),
.Y = vcvtq_s16_f16(ptPoint->Y) };
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride;
/* retrieve all point values */
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
/* alpha blending */
uint16x8_t vBlended =
__arm_2d_rgb565_alpha_blending_single_vec(ptVal, vTarget, chOpacity);
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vst1q_p(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha_offs_compensated(
arm_2d_point_f16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint16_t *pOrigin,
int16_t iOrigStride,
uint16_t *pTarget,
uint16_t MaskColour,
uint8_t chOpacity,
uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vld1q(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
float16x8_t vOne = vdupq_n_f16(1.0f);
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
float16x8_t vAreaTR = vWX * vWY;
float16x8_t vAreaTL = (vOne - vWX) * vWY;
float16x8_t vAreaBR = vWX * (vOne - vWY);
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
/* accumulated pixel vectors */
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* blending */
uint16x8_t vAvgR, vAvgG, vAvgB;
vAvgR = vcvtq_s16_f16(vAvgPixelR);
vAvgG = vcvtq_s16_f16(vAvgPixelG);
vAvgB = vcvtq_s16_f16(vAvgPixelB);
uint16x8_t vBlended;
__ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vTarget, vAvgR, vAvgG, vAvgB, vBlended);
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
#else
arm_2d_point_s16x8_t vPoint = {
.X = vcvtq_s16_f16(ptPoint->X),
.Y = vcvtq_s16_f16(ptPoint->Y) };
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
/* correctionOffset avoids 16-bit overflow */
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
uint16x8_t ptOffs =
vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
/* base pointer update to compensate offset */
pOrigin += (correctionOffset * iOrigStride);
/* retrieve all point values */
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
/* alpha blending */
uint16x8_t vBlended =
__arm_2d_rgb565_alpha_blending_single_vec(ptVal, vTarget, chOpacity);
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vst1q_p(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_cccn888_get_pixel_colour( arm_2d_point_f16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint32_t *pOrigin,
int16_t iOrigStride,
uint32_t *pTarget,
uint32_t MaskColour,
int16_t elts)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
ARM_ALIGN(8) uint32_t scratch32[32];
int16_t *pscratch16 = (int16_t *) scratch32;
uint32x4_t vTargetLo = vld1q(pTarget);
uint32x4_t vTargetHi = vld1q(pTarget + 4);
mve_pred16_t predTailLow = vctp32q(elts);
mve_pred16_t predTailHigh = elts - 4 > 0 ? vctp32q(elts - 4) : 0;
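/*
The 8 source points are produced as one f16x8 vector, but CCCN888 pixels
are 32-bit, so the result is written back as two halves (pixels 0..3 and
4..7); predTailLow / predTailHigh mask the tail of each half when fewer
than 8 (resp. 4) elements remain.
*/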
float16x8_t vOne = vdupq_n_f16(1.0f);
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
float16x8_t vAreaTR = vWX * vWY;
float16x8_t vAreaTL = (vOne - vWX) * vWY;
float16x8_t vAreaBR = vWX * (vOne - vWY);
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
/* accumulated pixel vectors */
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulators */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlbLo = 0, predGlbHi = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi,
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* pack */
__arm_2d_pack_rgb888_to_mem((uint8_t *) scratch32, vcvtq_s16_f16(vAvgPixelR),
vcvtq_s16_f16(vAvgPixelG), vcvtq_s16_f16(vAvgPixelB));
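/* the packed CCCN888 results land in scratch32 and are reloaded below
as two uint32x4_t halves so the predicated selects/stores stay 32-bit */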
uint32x4_t TempPixel = vld1q(scratch32);
/* select between the target pixel and the averaged pixel */
TempPixel = vpselq_u32(TempPixel, vTargetLo, predGlbLo);
vst1q_p(pTarget, TempPixel, predTailLow);
TempPixel = vld1q(scratch32 + 4);
/* select between the target pixel and the averaged pixel */
TempPixel = vpselq_u32(TempPixel, vTargetHi, predGlbHi);
vst1q_p(pTarget + 4, TempPixel, predTailHigh);
#else
arm_2d_point_s32x4_t tPointLo, tPointHi;
ARM_ALIGN(8) int16_t scratch[8];
mve_pred16_t p;
/* split 16-bit point vector into 2 x 32-bit vectors */
vst1q(scratch, vcvtq_s16_f16(ptPoint->X));
tPointLo.X = vldrhq_s32(scratch);
tPointHi.X = vldrhq_s32(scratch + 4);
vst1q(scratch, vcvtq_s16_f16(ptPoint->Y));
tPointLo.Y = vldrhq_s32(scratch);
tPointHi.Y = vldrhq_s32(scratch + 4);
/* 1st half */
/* set vector predicate if point is inside the region */
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);
/* prepare vector of point offsets */
uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
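/* offsets are full 32-bit here, so no base-pointer correction is needed
(unlike the 16-bit gather paths) */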
uint32x4_t vPixel = vld1q(pTarget);
/* retrieve all point values */
uint32x4_t ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
/* combine 2 predicates set to true if point is in the region & values different from color mask */
vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
vst1q_p(pTarget, vPixel, vctp32q(elts));
elts -= 4;
if (elts > 0) {
/* second half */
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
vPixel = vld1q(pTarget + 4);
ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
}
#endif
}
static
void __arm_2d_impl_cccn888_get_pixel_colour_with_alpha(
arm_2d_point_f16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint32_t *pOrigin,
int16_t iOrigStride,
uint32_t *pTarget,
uint32_t MaskColour,
uint8_t chOpacity,
int16_t elts)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
ARM_ALIGN(8) uint32_t scratch32[32];
int16_t *pscratch16 = (int16_t *) scratch32;
uint32x4_t vTargetLo = vld1q(pTarget);
uint32x4_t vTargetHi = vld1q(pTarget + 4);
mve_pred16_t predTailLow = vctp32q(elts);
mve_pred16_t predTailHigh = elts - 4 > 0 ? vctp32q(elts - 4) : 0;
float16x8_t vOne = vdupq_n_f16(1.0f);
int16x8_t vXi = vcvtq_s16_f16(ptPoint->X);
int16x8_t vYi = vcvtq_s16_f16(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_f16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_f16(ptPoint->Y, 0));
float16x8_t vWX = ptPoint->X - vcvtq_f16_s16(vXi);
float16x8_t vWY = ptPoint->Y - vcvtq_f16_s16(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
float16x8_t vAreaTR = vWX * vWY;
float16x8_t vAreaTL = (vOne - vWX) * vWY;
float16x8_t vAreaBR = vWX * (vOne - vWY);
float16x8_t vAreaBL = (vOne - vWX) * (vOne - vWY);
/* accumulated pixel vectors */
float16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulators */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlbLo = 0, predGlbHi = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi,
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* alpha blending */
uint16x8_t vTargetR, vTargetG, vTargetB;
__arm_2d_unpack_rgb888_from_mem((const uint8_t *) pTarget, &vTargetR, &vTargetG, &vTargetB);
uint16_t chOpacityCompl = (256 - chOpacity);
uint16x8_t vAvgR, vAvgG, vAvgB;
vAvgR = vcvtq_s16_f16(vAvgPixelR);
vAvgG = vcvtq_s16_f16(vAvgPixelG);
vAvgB = vcvtq_s16_f16(vAvgPixelB);
/* merge */
vAvgR = vAvgR * chOpacityCompl + vTargetR * chOpacity;
vAvgR = vAvgR >> 8;
vAvgG = vAvgG * chOpacityCompl + vTargetG * chOpacity;
vAvgG = vAvgG >> 8;
vAvgB = vAvgB * chOpacityCompl + vTargetB * chOpacity;
vAvgB = vAvgB >> 8;
/* pack */
__arm_2d_pack_rgb888_to_mem((uint8_t *) scratch32, vAvgR, vAvgG, vAvgB);
uint32x4_t TempPixel = vld1q(scratch32);
/* select between the target pixel and the averaged pixel */
TempPixel = vpselq_u32(TempPixel, vTargetLo, predGlbLo);
vst1q_p(pTarget, TempPixel, predTailLow);
TempPixel = vld1q(scratch32 + 4);
/* select between the target pixel and the averaged pixel */
TempPixel = vpselq_u32(TempPixel, vTargetHi, predGlbHi);
vst1q_p(pTarget + 4, TempPixel, predTailHigh);
#else
arm_2d_point_s32x4_t tPointLo, tPointHi;
ARM_ALIGN(8) int16_t scratch[8];
ARM_ALIGN(8) uint32_t blendled[4];
mve_pred16_t p;
/* split 16-bit point vector into 2 x 32-bit vectors */
vst1q(scratch, vcvtq_s16_f16(ptPoint->X));
tPointLo.X = vldrhq_s32(scratch);
tPointHi.X = vldrhq_s32(scratch + 4);
vst1q(scratch, vcvtq_s16_f16(ptPoint->Y));
tPointLo.Y = vldrhq_s32(scratch);
tPointHi.Y = vldrhq_s32(scratch + 4);
/* 1st half */
/* set vector predicate if point is inside the region */
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);
/* prepare vector of point offsets */
uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
uint32x4_t vPixel = vld1q(pTarget);
/* retrieve all point values */
uint32x4_t ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
vstrwq_u32((uint32_t *) scratch, ptVal);
/* alpha-blending (requires widened inputs) */
vstrbq_u16((uint8_t *) blendled,
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) scratch),
vldrbq_u16((uint8_t const *) pTarget), chOpacity));
vstrbq_u16((uint8_t *) blendled + 2,
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *)scratch + 4),
vldrbq_u16((uint8_t const *)pTarget + 2), chOpacity));
uint32x4_t vBlended = vld1q(blendled);
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
vst1q_p(pTarget, vPixel, vctp32q(elts));
elts -= 4;
if(elts > 0) {
/* second half */
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
vPixel = vld1q(pTarget + 4);
ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
vstrwq_u32((uint32_t *) scratch, ptVal);
/* alpha-blending (requires widened inputs) */
vstrbq_u16((uint8_t *) blendled,
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) scratch),
vldrbq_u16((uint8_t const *) pTarget), chOpacity));
vstrbq_u16((uint8_t *) blendled + 2,
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *)scratch + 4),
vldrbq_u16((uint8_t const *)pTarget + 2), chOpacity));
vBlended = vld1q(blendled);
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
}
#endif
}
#else /* __ARM_2D_HAS_HELIUM_FLOAT__ && ! __ARM_2D_CFG_FORCED_FIXED_POINT_ROTATION__ */
/* no extra calibration in the fixed-point code: the residual offset is below the Q9.6 resolution */
#define ONE_BY_2PI_Q31 341782637.0f
#define ARSHIFT(x, shift) (shift > 0 ? x >> shift : x << (-shift))
#define TO_Q16(x) ((x) << 16)
#define GET_Q6INT(x) ((x) >> 6)
#define SET_Q6INT(x) ((x) << 6)
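/*
ONE_BY_2PI_Q31 is 2^31 / (2*PI): the radian angle is converted to the
Q31 "fraction of a full turn" format expected by arm_cos_q31/arm_sin_q31.
Coordinates below are kept in Q9.6: GET_Q6INT extracts the integer part,
SET_Q6INT converts an integer back to Q9.6, and TO_Q16 promotes an
integer to Q16.16.
*/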
/**
Scale Gray8 channel
*/
#define __ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vScal) \
vAvgPixel = vmulhq_u16(vScal, ptVal8);
/**
Scale Gray8 channel with accumulation
*/
#define __ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vScal) \
vAvgPixel += vmulhq_u16(vScal, ptVal8);
/**
Scale R, G & B channels
*/
#define __ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vScal) \
vAvgPixelR = vmulhq_u16(vScal, R); \
vAvgPixelG = vmulhq_u16(vScal, G); \
vAvgPixelB = vmulhq_u16(vScal, B);
/**
Scale R, G & B channels with accumulation
*/
#define __ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vScal) \
vAvgPixelR += vmulhq_u16(vScal, R); \
vAvgPixelG += vmulhq_u16(vScal, G); \
vAvgPixelB += vmulhq_u16(vScal, B);
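/*
vmulhq_u16 keeps the high halfword of each 16x16 product, so with the
area weights promoted to Q16 (see the vqshlq_n_u16(..., 4) conversions
in the pixel functions below) each macro is roughly, per lane:
    avg = (uint16_t)(((uint32_t)vScal * value) >> 16);
i.e. a 0..1 weight applied to the channel value.
*/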
#ifdef VECTORIZED_ROTATION_REGR
/* disabled as slower than scalar */
static
bool __arm_2d_rotate_regression(arm_2d_size_t * __RESTRICT ptCopySize,
arm_2d_location_t * pSrcPoint,
float fAngle,
arm_2d_location_t * tOffset,
arm_2d_location_t * center,
int32_t iOrigStride,
arm_2d_rot_linear_regr_t regrCoefs[]
)
{
int32_t iHeight = ptCopySize->iHeight;
int32_t iWidth = ptCopySize->iWidth;
q31_t invHeightFx = 0x7fffffff / (iHeight - 1);
arm_2d_point_s32x4_t vPointCornerI;
int32_t AngleFx = (int32_t) roundf(fAngle * ONE_BY_2PI_Q31);
q31_t cosAngleFx = arm_cos_q31(AngleFx);
q31_t sinAngleFx = arm_sin_q31(AngleFx);
int32x4_t vCornerX = { 0, 1, 0, 1 };
int32x4_t vCornerY = { 0, 0, 1, 1 };
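/* vCornerX/vCornerY enumerate the 4 corners in the lane order
(0,0), (iWidth-1,0), (0,iHeight-1), (iWidth-1,iHeight-1) */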
bool gatherLoadIdxOverflow = 0;
vPointCornerI.X = vdupq_n_s32(pSrcPoint->iX + tOffset->iX);
vPointCornerI.X = vPointCornerI.X + vmulq_n_s32(vCornerX, (iWidth - 1));
vPointCornerI.Y = vdupq_n_s32(pSrcPoint->iY + tOffset->iY);
vPointCornerI.Y = vPointCornerI.Y + vmulq_n_s32(vCornerY, (iHeight - 1));
/*
Vector version of:
int16_t iX = ptLocation->iX - ptCenter->iX;
int16_t iY = ptLocation->iY - ptCenter->iY;
q31_t cosAngleFx = arm_cos_q31(fAngle);
q31_t sinAngleFx = arm_sin_q31(fAngle);
tPointCornerFx[0][0].Y =
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
tPointCornerFx[0][0].X =
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
*/
arm_2d_point_s32x4_t vTmp1;
vTmp1.X = vsubq_n_s32(vPointCornerI.X, center->iX);
vTmp1.Y = vsubq_n_s32(vPointCornerI.Y, center->iY);
vTmp1.X <<= 16;
vTmp1.Y <<= 16;
vPointCornerI.X =
vqsubq(vqdmulhq_n_s32(vTmp1.X, cosAngleFx), vqdmulhq_n_s32(vTmp1.Y, sinAngleFx));
vPointCornerI.X = vqaddq_n_s32(vPointCornerI.X, (center->iX << 16));
vPointCornerI.Y = vqdmlahq(vqdmulhq_n_s32(vTmp1.X, sinAngleFx), vTmp1.Y, cosAngleFx);
vPointCornerI.Y = vqaddq_n_s32(vPointCornerI.Y, (center->iY << 16));
/*
Check whether rotated index offsets could exceed 16-bit limits
used in subsequent gather loads
This will occur for parts of large images (e.g. 320*200)
To avoid unconditional penalties for small/medium images,
a speculative overflow flag is returned so the caller can switch to the large-offset handling variant.
*/
int32_t maxY = vmaxvq(0, vPointCornerI.Y);
if(MULTFX(TO_Q16(iOrigStride), maxY) > UINT16_MAX)
gatherLoadIdxOverflow = true;
/* regression parameters */
vTmp1.X[0] = vPointCornerI.X[0];
vTmp1.X[1] = vPointCornerI.X[1];
vTmp1.X[2] = vPointCornerI.Y[0];
vTmp1.X[3] = vPointCornerI.Y[1];
vTmp1.Y[0] = vPointCornerI.X[2];
vTmp1.Y[1] = vPointCornerI.X[3];
vTmp1.Y[2] = vPointCornerI.Y[2];
vTmp1.Y[3] = vPointCornerI.Y[3];
/* slopes */
vTmp1.X = vqdmulhq_n_s32(vTmp1.Y - vTmp1.X, invHeightFx);
regrCoefs[0].slopeY = vTmp1.X[2];
regrCoefs[0].slopeX = vTmp1.X[0];
regrCoefs[0].interceptY = vPointCornerI.Y[0];
regrCoefs[0].interceptX = vPointCornerI.X[0];
regrCoefs[1].slopeY = vTmp1.X[3];
regrCoefs[1].slopeX = vTmp1.X[1];
regrCoefs[1].interceptY = vPointCornerI.Y[1];
regrCoefs[1].interceptX = vPointCornerI.X[1];
return gatherLoadIdxOverflow;
}
#else
static
bool __arm_2d_rotate_regression(arm_2d_size_t * __RESTRICT ptCopySize,
arm_2d_location_t * pSrcPoint,
float fAngle,
arm_2d_location_t * tOffset,
arm_2d_location_t * center,
int32_t iOrigStride,
arm_2d_rot_linear_regr_t regrCoefs[]
)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
q31_t invHeightFx = 0x7fffffff / (iHeight - 1);
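/* invHeightFx is ~1/(iHeight - 1) in Q31; multiplying a corner-to-corner
Q16 delta by it (via MULTFX) yields the per-row slope used below */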
int32_t AngleFx = lroundf(fAngle * ONE_BY_2PI_Q31);
q31_t cosAngleFx = arm_cos_q31(AngleFx);
q31_t sinAngleFx = arm_sin_q31(AngleFx);
arm_2d_point_fx_t tPointCornerFx[2][2];
arm_2d_point_fx_t centerQ16;
arm_2d_point_fx_t srcPointQ16;
arm_2d_point_fx_t tOffsetQ16;
arm_2d_point_fx_t tmp;
int32_t iXQ16, iYQ16;
bool gatherLoadIdxOverflow = 0;
/* Q16 conversion */
centerQ16.X = TO_Q16(center->iX);
centerQ16.Y = TO_Q16(center->iY);
srcPointQ16.X = TO_Q16(pSrcPoint->iX);
srcPointQ16.Y = TO_Q16(pSrcPoint->iY);
tOffsetQ16.X = TO_Q16(tOffset->iX);
tOffsetQ16.Y = TO_Q16(tOffset->iY);
/* (0,0) corner */
tmp.X = srcPointQ16.X + 0 + tOffsetQ16.X;
tmp.Y = srcPointQ16.Y + 0 + tOffsetQ16.Y;
iXQ16 = tmp.X - centerQ16.X;
iYQ16 = tmp.Y - centerQ16.Y;
tPointCornerFx[0][0].Y =
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
tPointCornerFx[0][0].X =
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
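/*
Q16 rotation of the corner around the pivot, i.e.:
    Y' = centerY + iY * cos(angle) + iX * sin(angle)
    X' = centerX + iX * cos(angle) - iY * sin(angle)
Assuming MULTFX(a, b) is the usual (int64_t)a * b >> 32, a Q16 * Q31
product gives a Q15 term, and the saturating doubling in __QDADD/__QDSUB
restores Q16 while clamping any overflow.
*/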
/* ((iWidth - 1),0) corner */
tmp.X = srcPointQ16.X + 0 + tOffsetQ16.X + TO_Q16(iWidth - 1);
iXQ16 = tmp.X - centerQ16.X;
tPointCornerFx[1][0].Y =
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
tPointCornerFx[1][0].X =
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
/* ((iWidth - 1),(iHeight - 1)) corner */
tmp.Y = srcPointQ16.Y + tOffsetQ16.Y + TO_Q16(iHeight - 1);
iYQ16 = tmp.Y - centerQ16.Y;
tPointCornerFx[1][1].Y =
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
tPointCornerFx[1][1].X =
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
/* (0,(iHeight - 1)) corner */
tmp.X = srcPointQ16.X + 0 + tOffsetQ16.X;
iXQ16 = tmp.X - centerQ16.X;
tPointCornerFx[0][1].Y =
__QDADD(__QDADD(centerQ16.Y, MULTFX(iYQ16, cosAngleFx)), MULTFX(iXQ16, sinAngleFx));
tPointCornerFx[0][1].X =
__QDSUB(__QDADD(centerQ16.X, MULTFX(iXQ16, cosAngleFx)), MULTFX(iYQ16, sinAngleFx));
/*
Check whether rotated index offsets could exceed 16-bit limits
used in subsequent gather loads
This will occur for parts of large images (e.g. 320*200)
To avoid unconditional penalties for small/medium images,
a speculative overflow flag is returned so the caller can switch to the large-offset handling variant.
*/
int32_t maxY = MAX(MAX
(MAX(tPointCornerFx[0][0].Y, tPointCornerFx[0][1].Y),
tPointCornerFx[1][0].Y),
tPointCornerFx[1][1].Y);
if(MULTFX(TO_Q16(iOrigStride), maxY) > UINT16_MAX)
gatherLoadIdxOverflow = true;
/* regression */
int32_t slopeXFx, slopeYFx;
/* interpolation in Y direction for 1st elements column */
slopeXFx = MULTFX((tPointCornerFx[0][1].X - tPointCornerFx[0][0].X), invHeightFx);
slopeYFx = MULTFX((tPointCornerFx[0][1].Y - tPointCornerFx[0][0].Y), invHeightFx);
regrCoefs[0].slopeY = slopeYFx * 2;
regrCoefs[0].slopeX = slopeXFx * 2;
regrCoefs[0].interceptY = tPointCornerFx[0][0].Y;
regrCoefs[0].interceptX = tPointCornerFx[0][0].X;
/* interpolation in Y direction for the last elements column */
slopeXFx = MULTFX((tPointCornerFx[1][1].X - tPointCornerFx[1][0].X), invHeightFx);
slopeYFx = MULTFX((tPointCornerFx[1][1].Y - tPointCornerFx[1][0].Y), invHeightFx);
regrCoefs[1].slopeY = slopeYFx* 2;
regrCoefs[1].slopeX = slopeXFx* 2;
regrCoefs[1].interceptY = tPointCornerFx[1][0].Y;
regrCoefs[1].interceptX = tPointCornerFx[1][0].X;
return gatherLoadIdxOverflow;
}
#endif
static
void __arm_2d_impl_gray8_get_pixel_colour(arm_2d_point_s16x8_t * ptPoint,
arm_2d_region_t * ptOrigValidRegion,
uint8_t * pOrigin,
int16_t iOrigStride,
uint8_t * pTarget,
uint8_t MaskColour, uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vldrbq_u16(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
int16x8_t vXi = GET_Q6INT(ptPoint->X);
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
/* Q16 conversion */
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
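/*
vWX/vWY are Q6 fractions (0..64), so each area product is Q12; the
left shift by 4 promotes the weights to Q16 so that vmulhq_u16 in the
scale macros applies them as 0..1 ratios.
*/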
/* accumulated pixel vectors */
uint16x8_t vAvgPixel;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t ptVal8;
/* Bottom Left averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, ptVal8, MaskColour,
predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), ptVal8, MaskColour,
predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTR);
}
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(vAvgPixel, vTarget, predGlb);
#else
/* extract integer part */
arm_2d_point_s16x8_t vPoint = {
.X = GET_Q6INT(ptPoint->X),
.Y = GET_Q6INT(ptPoint->Y)
};
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
/* correctionOffset avoids 16-bit overflow */
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
/* base pointer update to compensate offset */
pOrigin += (correctionOffset * iOrigStride);
/* retrieve all point values */
uint16x8_t ptVal = vldrbq_gather_offset_z_u16(pOrigin, ptOffs, predTail);
/* combine 2 predicates set to true if point is in the region & values different from color mask */
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vstrbq_p_u16(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_gray8_get_pixel_colour_with_alpha(arm_2d_point_s16x8_t * ptPoint,
arm_2d_region_t *
ptOrigValidRegion,
uint8_t * pOrigin,
int16_t iOrigStride,
uint8_t * pTarget,
uint8_t MaskColour,
uint8_t chOpacity, uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vldrbq_u16(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
int16x8_t vXi = GET_Q6INT(ptPoint->X);
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
/* Q16 conversion */
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
/* accumulated pixel vectors */
uint16x8_t vAvgPixel;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t ptVal8;
/* Bottom Left averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC(vAvgPixel, ptVal8, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, ptVal8, MaskColour,
predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), ptVal8, MaskColour,
predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_GRAY8_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
ptVal8, MaskColour, predGlb);
__ARM_2D_SCALE_GRAY8VEC_ACC(vAvgPixel, ptVal8, vAreaTR);
}
/* alpha blending */
uint16_t chTransparency = 256 - (chOpacity);
uint16x8_t vBlended = (vAvgPixel * chTransparency + vTarget * chOpacity) >> 8;
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
#else
/* extract integer part */
arm_2d_point_s16x8_t vPoint = {
.X = GET_Q6INT(ptPoint->X),
.Y = GET_Q6INT(ptPoint->Y)
};
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
/* correctionOffset avoids 16-bit overflow */
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
uint16x8_t ptOffs = vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
/* base pointer update to compensate offset */
pOrigin += (correctionOffset * iOrigStride);
/* retrieve all point values */
uint16x8_t ptVal = vldrbq_gather_offset_z_u16(pOrigin, ptOffs, predTail);
/* alpha blending */
uint16_t chTransparency = 256 - (chOpacity);
uint16x8_t vBlended = (ptVal * chTransparency + vTarget * chOpacity) >> 8;
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vstrbq_p_u16(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_rgb565_get_pixel_colour( arm_2d_point_s16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint16_t *pOrigin,
int16_t iOrigStride,
uint16_t *pTarget,
uint16_t MaskColour,
uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vld1q(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
int16x8_t vXi = GET_Q6INT(ptPoint->X);
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
/* Q16 conversion */
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
/* accumulated pixel vectors */
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* pack */
uint16x8_t TempPixel = __arm_2d_rgb565_pack_single_vec(vAvgPixelR,
vAvgPixelG,
vAvgPixelB);
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
#else
/* extract integer part */
arm_2d_point_s16x8_t vPoint = {
.X = GET_Q6INT(ptPoint->X),
.Y = GET_Q6INT(ptPoint->Y)};
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride;
/* retrieve all point values */
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
/* combine 2 predicates set to true if point is in the region & values different from color mask */
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vst1q_p(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_rgb565_get_pixel_colour_offs_compensated( arm_2d_point_s16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint16_t *pOrigin,
int16_t iOrigStride,
uint16_t *pTarget,
uint16_t MaskColour,
uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vld1q(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
int16x8_t vXi = GET_Q6INT(ptPoint->X);
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
/* Q16 conversion */
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
/* accumulated pixel vectors */
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* pack */
uint16x8_t TempPixel = __arm_2d_rgb565_pack_single_vec(vAvgPixelR,
vAvgPixelG,
vAvgPixelB);
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(TempPixel, vTarget, predGlb);
#else
/* extract integer part */
arm_2d_point_s16x8_t vPoint = {
.X = GET_Q6INT(ptPoint->X),
.Y = GET_Q6INT(ptPoint->Y)};
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
/* correctionOffset avoids 16-bit overflow */
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
uint16x8_t ptOffs =
vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
/* base pointer update to compensate offset */
pOrigin += (correctionOffset * iOrigStride);
/* retrieve all point values */
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
/* combine 2 predicates set to true if point is in the region & values different from color mask */
vTarget = vpselq_u16(ptVal, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vst1q_p(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha(
arm_2d_point_s16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint16_t *pOrigin,
int16_t iOrigStride,
uint16_t *pTarget,
uint16_t MaskColour,
uint8_t chOpacity,
uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vld1q(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
int16x8_t vXi = GET_Q6INT(ptPoint->X);
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
/* Q16 conversion */
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
/* accumulated pixel vectors */
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* alpha blending */
uint16x8_t vBlended;
__ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vTarget, vAvgPixelR, vAvgPixelG, vAvgPixelB, vBlended);
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
#else
/* extract integer part */
arm_2d_point_s16x8_t vPoint = {
.X = GET_Q6INT(ptPoint->X),
.Y = GET_Q6INT(ptPoint->Y)};
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
uint16x8_t ptOffs = vPoint.X + vPoint.Y * iOrigStride;
/* retrieve all point values */
uint16x8_t ptVal = vldrhq_gather_shifted_offset_u16(pOrigin, ptOffs);
/* alpha blending */
uint16x8_t vBlended =
__arm_2d_rgb565_alpha_blending_single_vec(ptVal, vTarget, chOpacity);
/* combine 2 predicates, set to true, if point is in the region & values different from color mask */
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vst1q_p(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_rgb565_get_pixel_colour_with_alpha_offs_compensated(
arm_2d_point_s16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint16_t *pOrigin,
int16_t iOrigStride,
uint16_t *pTarget,
uint16_t MaskColour,
uint8_t chOpacity,
uint32_t elts)
{
mve_pred16_t predTail = vctp16q(elts);
uint16x8_t vTarget = vld1q(pTarget);
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
int16x8_t vXi = GET_Q6INT(ptPoint->X);
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
int16x8_t vWX = ptPoint->X - SET_Q6INT(vXi);
int16x8_t vWY = ptPoint->Y - SET_Q6INT(vYi);
/* combination of Bottom / Top & Left / Right areas contributions */
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
/* Q16 conversion */
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
/* accumulated pixel vectors */
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulator */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* could probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlb = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vYi, R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vXi, vaddq_n_s16(vYi, 1), R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB565_GET_RGBVEC_FROM_POINT_FAR(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlb);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* alpha blending */
uint16x8_t vBlended;
__ARM_2D_BLEND_RGB565_TARGET_RGBVEC(chOpacity, vTarget, vAvgPixelR, vAvgPixelG, vAvgPixelB, vBlended);
/* select between the target pixel and the averaged pixel */
vTarget = vpselq_u16(vBlended, vTarget, predGlb);
#else
/* extract integer part */
arm_2d_point_s16x8_t vPoint = {
.X = GET_Q6INT(ptPoint->X),
.Y = GET_Q6INT(ptPoint->Y)};
/* set vector predicate if point is inside the region */
mve_pred16_t p = arm_2d_is_point_vec_inside_region_s16(ptOrigValidRegion, &vPoint);
/* prepare vector of point offsets */
/* correctionOffset avoids 16-bit overflow */
int16_t correctionOffset = vminvq_s16(0x7fff, vPoint.Y) - 1;
uint16x8_t ptOffs =
vPoint.X + (vPoint.Y - correctionOffset) * iOrigStride;
/* retrieve all point values */
/* base pointer update to compensate offset */
pOrigin += (correctionOffset * iOrigStride);
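/*
* Informative example: the per-lane gather offsets are 16-bit, so the smallest
* Y value (minus 1) is folded into the base pointer instead. With
* iOrigStride = 320 and Y = {210, 211, ...}, correctionOffset = 209, pOrigin
* advances by 209 * 320 elements and the per-lane term becomes
* (Y - 209) * 320, which stays well below 65536.
*/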
uint16x8_t ptVal = vldrhq_gather_shifted_offset_z_u16(pOrigin, ptOffs, predTail);
/* alpha blending */
uint16x8_t vBlended =
__arm_2d_rgb565_alpha_blending_single_vec(ptVal, vTarget, chOpacity);
/* combine 2 predicates: set to true if the point is inside the region and the value differs from the color mask */
vTarget = vpselq_u16(vBlended, vTarget, vcmpneq_m_n_u16(ptVal, MaskColour, p));
#endif
vst1q_p(pTarget, vTarget, predTail);
}
static
void __arm_2d_impl_cccn888_get_pixel_colour( arm_2d_point_s16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint32_t *pOrigin,
int16_t iOrigStride,
uint32_t *pTarget,
uint32_t MaskColour,
int16_t elts)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
ARM_ALIGN(8) uint32_t scratch32[32];
int16_t *pscratch16 = (int16_t *) scratch32;
uint32x4_t vTargetLo = vld1q(pTarget);
uint32x4_t vTargetHi = vld1q(pTarget + 4);
mve_pred16_t predTailLow = vctp32q(elts);
mve_pred16_t predTailHigh = elts - 4 > 0 ? vctp32q(elts - 4) : 0;
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
int16x8_t vXi = GET_Q6INT(ptPoint->X);
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
int16x8_t vWX = ptPoint->X - (SET_Q6INT(vXi));
int16x8_t vWY = ptPoint->Y - (SET_Q6INT(vYi));
/* combination of Bottom / Top & Left / Right areas contributions */
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
/* Q16 conversion */
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
/* accumulated pixel vectors */
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulators */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* can probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlbLo = 0, predGlbHi = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi,
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* pack */
__arm_2d_pack_rgb888_to_mem((uint8_t *) scratch32, vAvgPixelR, vAvgPixelG, vAvgPixelB);
uint32x4_t TempPixel = vld1q(scratch32);
/* select between the target pixel and the averaged pixel */
TempPixel = vpselq_u32(TempPixel, vTargetLo, predGlbLo);
vst1q_p(pTarget, TempPixel, predTailLow);
TempPixel = vld1q(scratch32 + 4);
/* select between the target pixel and the averaged pixel */
TempPixel = vpselq_u32(TempPixel, vTargetHi, predGlbHi);
vst1q_p(pTarget + 4, TempPixel, predTailHigh);
#else
arm_2d_point_s32x4_t tPointLo, tPointHi;
ARM_ALIGN(8) int16_t scratch[8];
mve_pred16_t p;
/* split 16-bit point vector into 2 x 32-bit vectors */
vst1q(scratch, GET_Q6INT(ptPoint->X));
tPointLo.X = vldrhq_s32(scratch);
tPointHi.X = vldrhq_s32(scratch + 4);
vst1q(scratch, GET_Q6INT(ptPoint->Y));
tPointLo.Y = vldrhq_s32(scratch);
tPointHi.Y = vldrhq_s32(scratch + 4);
/* 1st half */
/* set vector predicate if point is inside the region */
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);
/* prepare vector of point offsets */
uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
uint32x4_t vPixel = vld1q(pTarget);
/* retrieve all point values */
uint32x4_t ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
/* combine 2 predicates: set to true if the point is inside the region and the value differs from the color mask */
vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
vst1q_p(pTarget, vPixel, vctp32q(elts));
elts -= 4;
if (elts > 0) {
/* second half */
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
vPixel = vld1q(pTarget + 4);
ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
vPixel = vpselq_u32(ptVal, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
}
#endif
}
static
void __arm_2d_impl_cccn888_get_pixel_colour_with_alpha(
arm_2d_point_s16x8_t *ptPoint,
arm_2d_region_t *ptOrigValidRegion,
uint32_t *pOrigin,
int16_t iOrigStride,
uint32_t *pTarget,
uint32_t MaskColour,
uint8_t chOpacity,
int16_t elts)
{
#if defined(__ARM_2D_HAS_INTERPOLATION_ROTATION__) && __ARM_2D_HAS_INTERPOLATION_ROTATION__
ARM_ALIGN(8) uint32_t scratch32[32];
int16_t *pscratch16 = (int16_t *) scratch32;
uint32x4_t vTargetLo = vld1q(pTarget);
uint32x4_t vTargetHi = vld1q(pTarget + 4);
mve_pred16_t predTailLow = vctp32q(elts);
mve_pred16_t predTailHigh = elts - 4 > 0 ? vctp32q(elts - 4) : 0;
int16x8_t vOne = vdupq_n_s16(SET_Q6INT(1));
int16x8_t vXi = GET_Q6INT(ptPoint->X);
int16x8_t vYi = GET_Q6INT(ptPoint->Y);
vXi = vsubq_m_n_s16(vXi, vXi, 1, vcmpltq_n_s16(ptPoint->X, 0));
vYi = vsubq_m_n_s16(vYi, vYi, 1, vcmpltq_n_s16(ptPoint->Y, 0));
int16x8_t vWX = ptPoint->X - (SET_Q6INT(vXi));
int16x8_t vWY = ptPoint->Y - (SET_Q6INT(vYi));
/* combination of Bottom / Top & Left / Right areas contributions */
uint16x8_t vAreaTR = vmulq_u16(vWX, vWY);
uint16x8_t vAreaTL = vmulq_u16((vOne - vWX), vWY);
uint16x8_t vAreaBR = vmulq_u16(vWX, (vOne - vWY));
uint16x8_t vAreaBL = vmulq_u16((vOne - vWX), (vOne - vWY));
/* Q16 conversion */
vAreaTR = vqshlq_n_u16(vAreaTR, 4);
vAreaTL = vqshlq_n_u16(vAreaTL, 4);
vAreaBR = vqshlq_n_u16(vAreaBR, 4);
vAreaBL = vqshlq_n_u16(vAreaBL, 4);
/* accumulated pixel vectors */
uint16x8_t vAvgPixelR, vAvgPixelG, vAvgPixelB;
/* predicate accumulators */
/* tracks all predication conditions used to select between the final */
/* averaged pixel and the target pixel */
/* can probably be optimized away since the averaged target pixel is */
/* equivalent to the original pixel, but precision limitations would introduce small errors */
mve_pred16_t predGlbLo = 0, predGlbHi = 0;
/*
* accumulate / average over the 4 neighbouring pixels
*/
uint16x8_t R, G, B;
/* Bottom Left averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vYi, R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBL);
}
/* Bottom Right averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vYi,
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaBR);
}
/* Top Left averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vXi, vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTL);
}
/* Top Right averaging */
{
__ARM_2D_RGB888_GET_RGBVEC_FROM_POINT(vaddq_n_s16(vXi, 1), vaddq_n_s16(vYi, 1),
R, G, B, MaskColour, predGlbLo, predGlbHi);
__ARM_2D_SCALE_RGBVEC_ACC(vAvgPixelR, vAvgPixelG, vAvgPixelB, R, G, B, vAreaTR);
}
/* alpha blending */
uint16x8_t vTargetR, vTargetG, vTargetB;
__arm_2d_unpack_rgb888_from_mem((const uint8_t *) pTarget, &vTargetR, &vTargetG, &vTargetB);
uint16_t chOpacityCompl = (256 - chOpacity);
/* merge */
vAvgPixelR = vmlaq_n_u16(vmulq_n_u16(vAvgPixelR, chOpacityCompl), vTargetR, chOpacity);
vAvgPixelR = vAvgPixelR >> 8;
vAvgPixelG = vmlaq_n_u16(vmulq_n_u16(vAvgPixelG, chOpacityCompl), vTargetG, chOpacity);
vAvgPixelG = vAvgPixelG >> 8;
vAvgPixelB = vmlaq_n_u16(vmulq_n_u16(vAvgPixelB, chOpacityCompl), vTargetB, chOpacity);
vAvgPixelB = vAvgPixelB >> 8;
/* pack */
__arm_2d_pack_rgb888_to_mem((uint8_t *) scratch32, vAvgPixelR, vAvgPixelG, vAvgPixelB);
uint32x4_t TempPixel = vld1q(scratch32);
/* select between the target pixel and the averaged pixel */
TempPixel = vpselq_u32(TempPixel, vTargetLo, predGlbLo);
vst1q_p(pTarget, TempPixel, predTailLow);
TempPixel = vld1q(scratch32 + 4);
/* select between the target pixel and the averaged pixel */
TempPixel = vpselq_u32(TempPixel, vTargetHi, predGlbHi);
vst1q_p(pTarget + 4, TempPixel, predTailHigh);
#else
arm_2d_point_s32x4_t tPointLo, tPointHi;
ARM_ALIGN(8) int16_t scratch[8];
ARM_ALIGN(8) uint32_t blendled[4];
mve_pred16_t p;
/* split 16-bit point vector into 2 x 32-bit vectors */
vst1q(scratch, GET_Q6INT(ptPoint->X));
tPointLo.X = vldrhq_s32(scratch);
tPointHi.X = vldrhq_s32(scratch + 4);
vst1q(scratch, GET_Q6INT(ptPoint->Y));
tPointLo.Y = vldrhq_s32(scratch);
tPointHi.Y = vldrhq_s32(scratch + 4);
/* 1st half */
/* set vector predicate if point is inside the region */
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointLo);
/* prepare vector of point offsets */
uint32x4_t ptOffs = tPointLo.X + tPointLo.Y * iOrigStride;
uint32x4_t vPixel = vld1q(pTarget);
/* retrieve all point values */
uint32x4_t ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
vstrwq_u32((uint32_t *) scratch, ptVal);
/* alpha-blending (requires widened inputs) */
vstrbq_u16((uint8_t *) blendled,
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) scratch),
vldrbq_u16((uint8_t const *) pTarget), chOpacity));
vstrbq_u16((uint8_t *) (blendled + 2),
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) (scratch + 4)),
vldrbq_u16((uint8_t const *) (pTarget + 2)), chOpacity));
uint32x4_t vBlended = vld1q(blendled);
/* combine 2 predicates: set to true if the point is inside the region and the value differs from the color mask */
vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
vst1q_p(pTarget, vPixel, vctp32q(elts));
elts -= 4;
if(elts > 0) {
/* second half */
p = arm_2d_is_point_vec_inside_region_s32(ptOrigValidRegion, &tPointHi);
ptOffs = tPointHi.X + tPointHi.Y * iOrigStride;
vPixel = vld1q(pTarget + 4);
ptVal = vldrwq_gather_shifted_offset_u32(pOrigin, ptOffs);
vstrwq_u32((uint32_t *) scratch, ptVal);
/* alpha-blending (requires widened inputs) */
vstrbq_u16((uint8_t *) blendled,
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) scratch),
vldrbq_u16((uint8_t const *) (pTarget + 4)), chOpacity));
vstrbq_u16((uint8_t *) (blendled + 2),
__rgb888_alpha_blending_direct_single_vec(vldrbq_u16((uint8_t const *) (scratch + 4)),
vldrbq_u16((uint8_t const *) (pTarget + 6)), chOpacity));
vBlended = vld1q(blendled);
/* combine 2 predicates: set to true if the point is inside the region and the value differs from the color mask */
vPixel = vpselq_u32(vBlended, vPixel, vcmpneq_m_n_u32(ptVal, MaskColour, p));
vst1q_p(pTarget + 4, vPixel, vctp32q(elts));
}
#endif
}
#endif
#define __API_INT_TYPE_BIT_NUM 8
#define __API_COLOUR gray8
#include "__arm_2d_rotate_helium.inc"
#define __API_INT_TYPE_BIT_NUM 16
#define __API_COLOUR rgb565
#include "__arm_2d_rotate_helium.inc"
#define __API_INT_TYPE_BIT_NUM 32
#define __API_COLOUR cccn888
#include "__arm_2d_rotate_helium.inc"
/* rgb8_draw_pattern helpers */
/*
* allows picking the gather-load offsets based on the initial offset
* e.g. if iOffset = 3
* will get {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2}
*/
static uint8_t __draw_pattern_src_incr_c8bit[32] = {
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3
};
/*
* allows picking the vector bitmask based on the initial offset
* e.g. if iOffset = 3
* will get {8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4}
*/
static uint8_t __draw_pattern_src_bitmask_c8bit[32] = {
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
};
/* rgb16_draw_pattern helpers */
/*
* allows picking the gather-load offsets based on the initial offset
* e.g. if iOffset = 3
* will get {0, 0, 0, 0, 0, 1, 1, 1}
*/
static uint16_t __draw_pattern_src_incr_rgb16[16] = {
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1
};
/*
* allows picking the vector bitmask based on the initial offset
* e.g. if iOffset = 3
* will get {8, 16, 32, 64, 128, 1, 2, 4}
*/
static uint16_t __draw_pattern_src_bitmask_rgb16[16] = {
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
};
/* rgb32_draw_pattern helpers */
static uint32_t __draw_pattern_src_incr_rgb32[16] = {
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1
};
static uint32_t __draw_pattern_src_bitmask_rgb32[16] = {
1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128,
};
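/*
* Informative usage sketch: the tables above are indexed by the initial bit
* offset, so a single vector load such as
*   vld1q_u16(&__draw_pattern_src_incr_rgb16[iOffset]);
*   vld1q_u16(&__draw_pattern_src_bitmask_rgb16[iOffset]);
* yields the pre-rotated per-lane byte increments and bit masks shown in the
* comments above, avoiding per-lane modulo arithmetic in the inner loops.
*/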
/*! add support using the C code template */
#define __API_COLOUR c8bit
#define __API_ELT_SZ 8
#include "__arm_2d_draw_pattern_helium.inc"
#define __API_COLOUR rgb16
#define __API_ELT_SZ 16
#include "__arm_2d_draw_pattern_helium.inc"
#define __API_COLOUR rgb32
#define __API_ELT_SZ 32
#include "__arm_2d_draw_pattern_helium.inc"
#define __API_COLOUR c8bit
#define __API_ELT_SZ 8
#include "__arm_2d_fill_colour_helium.inc"
#define __API_COLOUR rgb16
#define __API_ELT_SZ 16
#include "__arm_2d_fill_colour_helium.inc"
#define __API_COLOUR rgb32
#define __API_ELT_SZ 32
#include "__arm_2d_fill_colour_helium.inc"
/**
8-bit pixel color fill alpha/channel mask with/without opacity MVE intrinsic generator
- TRGT_LOAD is a contiguous / strided target load function
C8BIT_TRGT_LOAD / C8BIT_TRGT_LOAD_STRIDE
- STRIDE is an optional vector of offsets for the gather load
- SCAL_OPACITY is an extra alpha scaling function
C8BIT_SCAL_OPACITY_NONE / C8BIT_SCAL_OPACITY
- OPACITY is an optional 8-bit vector with duplicated opacity values
(needs the vector format to be used with VMULH.U8)
- ALPHA_SZ, alpha channel width (1 or 4 for resp. 8-bit or 32-bit type)
The macro assumes pTarget8 / pAlpha are already set up
*/
#define C8BIT_COLOUR_FILLING_MASK_INNER_MVE(TRGT_LOAD, STRIDE, SCAL_OPACITY, \
OPACITY, ALPHA_SZ) \
int32_t blkCnt = iWidth; \
do { \
mve_pred16_t tailPred = vctp16q(blkCnt); \
\
uint16x8_t vecTarget = vldrbq_z_u16(pTarget8, tailPred); \
uint16x8_t vecTransp = TRGT_LOAD(pAlpha, STRIDE, tailPred); \
\
vecTransp = SCAL_OPACITY(vecTransp, OPACITY, tailPred); \
uint16x8_t vecAlpha = vsubq_x_u16(v256, vecTransp, tailPred); \
\
vecTarget = vmulq_x(vecTarget, vecAlpha, tailPred); \
vecTarget = vmlaq_m(vecTarget, vecTransp, (uint16_t) Colour, tailPred); \
vecTarget = vecTarget >> 8; \
\
vstrbq_p_u16(pTarget8, vecTarget, tailPred); \
\
pAlpha += (8 * ALPHA_SZ); \
pTarget8 += 8; \
blkCnt -= 8; \
} \
while (blkCnt > 0);
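/*
* Informative scalar equivalent of one lane of the loop above:
*   alpha = *pAlpha;                               (optionally opacity-scaled)
*   *pTarget8 = (*pTarget8 * (256 - alpha) + Colour * alpha) >> 8;
* The MVE version performs 8 such blends per iteration under tail predication.
*/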
/**
RGB565 pixel color fill alpha/channel mask with/without opacity MVE intrinsic generator
- TRGT_LOAD is a contiguous / strided target load function
RGB565_TRGT_LOAD / RGB565_TRGT_LOAD_STRIDE
- STRIDE is an optional vector of offsets for the gather load
- SCAL_OPACITY is an extra alpha scaling function
RGB565_SCAL_OPACITY_NONE / RGB565_SCAL_OPACITY
- OPACITY is an optional 8-bit vector with duplicated opacity values
(needs the vector format to be used with VMULH.U8)
- P_ALPHA, 8-bit or 32-bit alpha channel pointer
- ALPHA_SZ, alpha channel width (1 or 4 for resp. 8-bit or 32-bit type)
Does not generate a tail-predicated loop since it relies on pack/unpack functions;
predication is only applied in the final stage during the pixel store.
*/
#define RGB565_COLOUR_FILLING_MASK_MVE(TRGT_LOAD, STRIDE, SCAL_OPACITY, OPACITY, \
P_ALPHA, ALPHA_SZ) \
uint16x8_t v256 = vdupq_n_u16(256); \
\
for (int_fast16_t y = 0; y < iHeight; y++) { \
const uint8_t *pAlpha = (const uint8_t *)P_ALPHA; \
uint16_t *pCurTarget = pTarget; \
int32_t blkCnt = iWidth; \
\
do { \
uint16x8_t vecTarget = vld1q(pCurTarget); \
uint16x8_t vecTransp = TRGT_LOAD(pAlpha, STRIDE); \
vecTransp = SCAL_OPACITY(vecTransp, OPACITY); \
uint16x8_t vecAlpha = vsubq_u16(v256, vecTransp); \
uint16x8_t vecR, vecG, vecB; \
\
__arm_2d_rgb565_unpack_single_vec(vecTarget, &vecR, &vecG, &vecB); \
\
/* blending using alpha vector weights */ \
vecR = vmulq(vecR, vecAlpha); \
vecR = vmlaq(vecR, vecTransp, (uint16_t) tSrcPix.R); \
vecR = vecR >> 8; \
\
vecG = vmulq(vecG, vecAlpha); \
vecG = vmlaq(vecG, vecTransp, (uint16_t) tSrcPix.G); \
vecG = vecG >> 8; \
\
vecB = vmulq(vecB, vecAlpha); \
vecB = vmlaq(vecB, vecTransp, (uint16_t) tSrcPix.B); \
vecB = vecB >> 8; \
\
vecTarget = __arm_2d_rgb565_pack_single_vec(vecR, vecG, vecB); \
\
/* tail predication */ \
vst1q_p_u16(pCurTarget, vecTarget, vctp16q(blkCnt)); \
\
pAlpha += (8 * ALPHA_SZ); \
pCurTarget += 8; \
blkCnt -= 8; \
} \
while (blkCnt > 0); \
\
P_ALPHA += (iAlphaStride); \
pTarget += (iTargetStride); \
}
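/*
* Informative scalar equivalent of one lane of the loop above, per channel:
*   alpha = *pAlpha;                               (optionally opacity-scaled)
*   R = (R * (256 - alpha) + tSrcPix.R * alpha) >> 8;   (same for G and B)
* followed by repacking into the 5/6/5-bit RGB565 layout; as noted above,
* only the final store is predicated, so the unpredicated target load may
* read a few pixels beyond the requested width.
*/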
/**
CCCN888 pixel color fill alpha/channel mask with/without opacity MVE intrinsic generator
- TRGT_LOAD is a contiguous / strided target load function
CCCN888_TRGT_LOAD / CCCN888_TRGT_LOAD_STRIDE
- STRIDE is an optional vector of offsets for the gather load
- SCAL_OPACITY is an extra alpha scaling function
CCCN888_SCAL_OPACITY_NONE / CCCN888_SCAL_OPACITY
- OPACITY is an optional 8-bit vector with duplicated opacity values
(needs the vector format to be used with VMULH.U8)
- ALPHA_SZ, alpha channel width (1 or 4 for resp. 8-bit or 32-bit type)
The macro assumes pTargetCh0/1/2 & pAlpha are already set up
*/
#define CCCN888_COLOUR_FILLING_MASK_INNER_MVE(TRGT_LOAD, STRIDE, SCAL_OPACITY, \
OPACITY, ALPHA_SZ) \
int32_t blkCnt = iWidth; \
\
do { \
mve_pred16_t tailPred = vctp16q(blkCnt); \
\
/* expand chan0, chan1, chan2 */ \
uint16x8_t vecTargetC0 = vldrbq_gather_offset_z_u16(pTargetCh0, vStride4Offs, \
tailPred); \
uint16x8_t vecTargetC1 = vldrbq_gather_offset_z_u16(pTargetCh1, vStride4Offs, \
tailPred); \
uint16x8_t vecTargetC2 = vldrbq_gather_offset_z_u16(pTargetCh2, vStride4Offs, \
tailPred); \
uint16x8_t vecTransp = TRGT_LOAD(pAlpha, STRIDE, tailPred); \
\
vecTransp = SCAL_OPACITY(vecTransp, OPACITY, tailPred); \
\
uint16x8_t vecAlpha = vsubq_x_u16(v256, vecTransp, tailPred); \
\
/* scale ch0 vector with alpha vector */ \
vecTargetC0 = vmulq_x(vecTargetC0, vecAlpha, tailPred); \
/* blend ch0 vector with input ch0 color*/ \
vecTargetC0 = vmlaq_m(vecTargetC0, vecTransp, (uint16_t) c0, tailPred); \
vecTargetC0 = vecTargetC0 >> 8; \
\
/* repeat for ch1 and ch2 */ \
vecTargetC1 = vmulq_x(vecTargetC1, vecAlpha, tailPred); \
vecTargetC1 = vmlaq_m(vecTargetC1, vecTransp, (uint16_t) c1, tailPred); \
vecTargetC1 = vecTargetC1 >> 8; \
\
vecTargetC2 = vmulq_x(vecTargetC2, vecAlpha, tailPred); \
vecTargetC2 = vmlaq_m(vecTargetC2, vecTransp, (uint16_t) c2, tailPred); \
vecTargetC2 = vecTargetC2 >> 8; \
\
/* store and merge chan0, chan1, chan2 */ \
vstrbq_scatter_offset_p_u16(pTargetCh0, vStride4Offs, vecTargetC0, tailPred); \
vstrbq_scatter_offset_p_u16(pTargetCh1, vStride4Offs, vecTargetC1, tailPred); \
vstrbq_scatter_offset_p_u16(pTargetCh2, vStride4Offs, vecTargetC2, tailPred); \
\
pAlpha += 8 * ALPHA_SZ; \
pTargetCh0 += 8*4; \
pTargetCh1 += 8*4; \
pTargetCh2 += 8*4; \
blkCnt -= 8; \
} \
while (blkCnt > 0);
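/*
* Informative scalar equivalent of one lane of the loop above; each cccn888
* pixel is handled as three byte planes (ch0/ch1/ch2) addressed with the
* {0, 4, 8, ..., 28} offsets in vStride4Offs:
*   alpha = *pAlpha;                               (optionally opacity-scaled)
*   ch0 = (ch0 * (256 - alpha) + c0 * alpha) >> 8; (same for ch1 and ch2)
* The fourth byte of every pixel (the 'n' channel) is never written.
*/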
#define C8BIT_TRGT_LOAD(base, stride, pred) vldrbq_z_u16(base, pred)
#define C8BIT_TRGT_LOAD_STRIDE(base, stride, pred) vldrbq_gather_offset_z_u16(base, stride, pred);
#define C8BIT_SCAL_OPACITY_NONE(transp, opac, pred) transp
#define C8BIT_SCAL_OPACITY(transp, opac, pred) (uint16x8_t) vmulhq_x((uint8x16_t) transp, opac, pred)
#define RGB565_TRGT_LOAD(base, stride) vldrbq_u16(base)
#define RGB565_TRGT_LOAD_STRIDE(base, stride) vldrbq_gather_offset_u16(base, stride);
#define RGB565_SCAL_OPACITY_NONE(transp, opac) transp
#define RGB565_SCAL_OPACITY(transp, opac) (uint16x8_t) vmulhq((uint8x16_t) transp, opac)
#define CCCN888_TRGT_LOAD(base, stride, pred) vldrbq_z_u16(base, pred)
#define CCCN888_TRGT_LOAD_STRIDE(base, stride, pred) vldrbq_gather_offset_z_u16(base, stride, pred);
#define CCCN888_SCAL_OPACITY_NONE(transp, opac, pred) transp
#define CCCN888_SCAL_OPACITY(transp, opac, pred) (uint16x8_t) vmulhq_x((uint8x16_t) transp, opac, pred)
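/*
* Informative note: these helpers are the parameters consumed by the
* *_COLOUR_FILLING_MASK* generators above, e.g. a plain 8-bit mask uses
* (C8BIT_TRGT_LOAD, _, C8BIT_SCAL_OPACITY_NONE, _, 1) while a 32-bit channel
* mask with extra opacity uses
* (C8BIT_TRGT_LOAD_STRIDE, vStride4Offs, C8BIT_SCAL_OPACITY, vOpacity, 4);
* the '_' arguments are unused placeholders.
*/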
__OVERRIDE_WEAK
void __arm_2d_impl_gray8_colour_filling_mask(uint8_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint8_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t * pAlpha = pchAlpha;
uint8_t * pTarget8 = pTarget;
#ifdef USE_MVE_INTRINSICS
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD, _,
C8BIT_SCAL_OPACITY_NONE, _, 1);
#else
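/*
* hand-scheduled MVE loop: wlstp.16/letp form a tail-predicated low-overhead
* loop over 16-bit lanes (8 pixels per iteration); the next target and alpha
* vectors are loaded ahead of the store to help hide memory latency
*/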
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
".p2align 2 \n"
" vldrb.u16 q0, [%[pTarget]] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vsub.i16 q2, %[vec256], q1 \n"
" vmul.u16 q3, q0, q2 \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vmla.u16 q3, q1, %[Colour] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[Colour] "r"(Colour)
:"q0", "q1", "q2", "q3", "memory");
#endif
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_gray8_colour_filling_mask_opacity(uint8_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint8_t Colour, uint8_t chOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
uint16x8_t v256 = vdupq_n_u16(256);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t * pAlpha = pchAlpha;
uint8_t * pTarget8 = pTarget;
#ifdef USE_MVE_INTRINSICS
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD, _,
C8BIT_SCAL_OPACITY, vOpacity, 1);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
".p2align 2 \n"
" vldrb.u16 q0, [%[pTarget]] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vsub.i16 q2, %[vec256], q1 \n"
" vmul.u16 q3, q0, q2 \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vmla.u16 q3, q1, %[Colour] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[Colour] "r"(Colour),[vOpacity] "t"(vOpacity)
:"q0", "q1", "q2", "q3", "memory");
#endif
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_gray8_colour_filling_channel_mask(uint8_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint8_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
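/*
* vidupq_n_u16(0, 4) yields {0, 4, 8, ..., 28}: byte offsets picking one byte
* (the mask channel) out of each of the 8 consecutive 32-bit mask entries
*/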
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
uint8_t * pTarget8 = pTarget;
#ifdef USE_MVE_INTRINSICS
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD_STRIDE, vStride4Offs,
C8BIT_SCAL_OPACITY_NONE, _, 4);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
".p2align 2 \n"
" vldrb.u16 q0, [%[pTarget]] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vsub.i16 q2, %[vec256], q1 \n"
" vmul.u16 q3, q0, q2 \n"
" add %[pAlpha], %[pAlpha], #(8*4) \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" vmla.u16 q3, q1, %[Colour] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[Colour] "r"(Colour),[str4Offs] "t"(vStride4Offs)
:"q0", "q1", "q2", "q3", "memory");
#endif
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_gray8_colour_filling_channel_mask_opacity(uint8_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint8_t Colour, uint8_t chOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
uint8_t *pTarget8 = pTarget;
#ifdef USE_MVE_INTRINSICS
C8BIT_COLOUR_FILLING_MASK_INNER_MVE(C8BIT_TRGT_LOAD_STRIDE, vStride4Offs,
C8BIT_SCAL_OPACITY, vOpacity, 4);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
".p2align 2 \n"
" vldrb.u16 q0, [%[pTarget]] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vsub.i16 q2, %[vec256], q1 \n"
" vmul.u16 q3, q0, q2 \n"
" vldrb.u16 q0, [%[pTarget], #8] \n"
" add %[pAlpha], %[pAlpha], #(8*4) \n"
" vmla.u16 q3, q1, %[Colour] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pTarget] "+r"(pTarget8), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[Colour] "r"(Colour),[vOpacity] "t"(vOpacity),
[str4Offs] "t"(vStride4Offs)
:"q0", "q1", "q2", "q3", "memory");
#endif
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_colour_filling_mask(uint16_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint16_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
__arm_2d_color_fast_rgb_t tSrcPix;
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
#ifdef USE_MVE_INTRINSICS
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD, _,
RGB565_SCAL_OPACITY_NONE, _, pchAlpha, 1);
#else
/* RGB565 pack/unpack Masks */
/* use memory rather than vmov to improve Helium operation interleaving */
uint16x8_t scratch[4];
// Unpacking Mask Red
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
// Unpacking Mask Green
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
// packing Mask Green
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
// packing Mask Blue
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = pchAlpha;
uint16_t *pCurTarget = pTarget;
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile (
".p2align 2 \n"
/* load scheduling */
" vldrh.u16 q0, [%[pTarget]] \n"
" vmov.i16 q7, #0x0100 \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
// vecAlpha
" vsub.i16 q2, q7, q1 \n"
/* RGB565 unpack */
/* vecAlpha * 4 for G channel upscale */
" vmul.i16 q2, q2, %[four] \n"
/* G channel extract */
" vshr.u16 q5, q0, #5 \n"
/* load Unpacking Mask for R channel */
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
" vand q4, q0, q7 \n"
/* load Unpacking Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
" vand q5, q5, q7 \n"
/* scale G vector with alpha vector */
" vmul.u16 q5, q5, q2 \n"
/* B channel */
" vshr.u16 q6, q0, #11 \n"
/* blend G vector with input G color*/
" vmla.u16 q5, q1, %[G] \n"
/* vecAlpha * 8 for R & B upscale */
" vshl.i16 q2, q2, #1 \n"
/* scale R vector with alpha vec */
" vmul.u16 q4, q4, q2 \n"
" vshr.u16 q5, q5, #8 \n"
/* blend R vector with input R color*/
" vmla.u16 q4, q1, %[R] \n"
/* load packing Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
/* scale B vector with alpha vector */
" vmul.u16 q6, q6, q2 \n"
" vand q5, q5, q7 \n"
/* blend B vector with input B color*/
" vmla.u16 q6, q1, %[B] \n"
/* load packing Mask for B channel */
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
" vshr.u16 q6, q6, #8 \n"
/* RGB 565 pack */
/* (G & 0x00fc), 8) */
" vmul.i16 q5, q5, %[eight] \n"
/* (B & 0x00f8) */
" vand q6, q6, q7 \n"
/* load next alpha vector */
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmov.i16 q7, #0x0100 \n"
/* pack G & B */
" vmla.u16 q5, q6, %[twofiftysix] \n"
/* combined (R >> 8) >> 3 */
" vshr.u16 q4, q4, #11 \n"
/* load next target */
" vldrh.u16 q0, [%[pTarget], #16] \n"
/* pack R */
" vorr q4, q4, q5 \n"
" vstrh.16 q4, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
[twofiftysix] "r" (256), [scratch] "r" (scratch)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_colour_filling_mask_opacity(uint16_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint16_t Colour, uint8_t chOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
__arm_2d_color_fast_rgb_t tSrcPix;
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
#ifdef USE_MVE_INTRINSICS
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD, _,
RGB565_SCAL_OPACITY, vOpacity, pchAlpha, 1);
#else
/* RGB565 pack/unpack Masks + opacity */
/* use memory rather than vmov to improve Helium operation interleaving */
uint16x8_t scratch[5];
// Unpacking Mask Red
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
// Unpacking Mask Green
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
// packing Mask Green
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
// packing Mask Blue
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
// opacity
vst1q((uint16_t*)&scratch[4], (uint16x8_t)vOpacity);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = pchAlpha;
uint16_t *pCurTarget = pTarget;
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile (
".p2align 2 \n"
/* load scheduling */
" vldrh.u16 q0, [%[pTarget]] \n"
" vmov.i16 q7, #0x0100 \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
/* opacity vector */
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
" vmulh.u8 q1, q1, q6 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
// vecAlpha
" vsub.i16 q2, q7, q1 \n"
/* RGB565 unpack */
/* vecAlpha * 4 for G channel upscale */
" vmul.i16 q2, q2, %[four] \n"
/* G channel extract */
" vshr.u16 q5, q0, #5 \n"
/* load Unpacking Mask for R channel */
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
" vand q4, q0, q7 \n"
/* load Unpacking Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
" vand q5, q5, q7 \n"
/* scale G vector with alpha vector */
" vmul.u16 q5, q5, q2 \n"
/* B channel */
" vshr.u16 q6, q0, #11 \n"
/* blend G vector with input G color*/
" vmla.u16 q5, q1, %[G] \n"
/* vecAlpha * 8 for R & B upscale */
" vshl.i16 q2, q2, #1 \n"
/* scale R vector with alpha vec */
" vmul.u16 q4, q4, q2 \n"
" vshr.u16 q5, q5, #8 \n"
/* blend R vector with input R color*/
" vmla.u16 q4, q1, %[R] \n"
/* load packing Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
/* scale B vector with alpha vector */
" vmul.u16 q6, q6, q2 \n"
" vand q5, q5, q7 \n"
/* blend B vector with input B color*/
" vmla.u16 q6, q1, %[B] \n"
/* load packing Mask for B channel */
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
" vshr.u16 q6, q6, #8 \n"
/* RGB 565 pack */
/* (G & 0x00fc), 8) */
" vmul.i16 q5, q5, %[eight] \n"
/* (B & 0x00f8) */
" vand q6, q6, q7 \n"
/* load next alpha vector */
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmov.i16 q7, #0x0100 \n"
/* pack G & B */
" vmla.u16 q5, q6, %[twofiftysix] \n"
/* reload opacity and scale alpha */
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
" vmulh.u8 q1, q1, q6 \n"
/* combined (R >> 8) >> 3 */
" vshr.u16 q4, q4, #11 \n"
/* load next target */
" vldrh.u16 q0, [%[pTarget], #16] \n"
/* pack R */
" vorr q4, q4, q5 \n"
" vstrh.16 q4, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
[twofiftysix] "r" (256), [scratch] "r" (scratch)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_colour_filling_channel_mask(uint16_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint16_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
__arm_2d_color_fast_rgb_t tSrcPix;
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
#ifdef USE_MVE_INTRINSICS
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD_STRIDE, vStride4Offs,
RGB565_SCAL_OPACITY_NONE, _, pwAlpha, 4);
#else
/* RGB565 pack/unpack Masks */
/* use memory rather than vmov to improve Helium operation interleaving */
uint16x8_t scratch[4];
// Unpacking Mask Red
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
// Unpacking Mask Green
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
// packing Mask Green
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
// packing Mask Blue
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint32_t *pAlpha = pwAlpha;
uint16_t *pCurTarget = pTarget;
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile (
".p2align 2 \n"
/* load scheduling */
" vldrh.u16 q0, [%[pTarget]] \n"
" vmov.i16 q7, #0x0100 \n"
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" add %[pAlpha], %[pAlpha],#(8*4)\n"
// vecAlpha
" vsub.i16 q2, q7, q1 \n"
/* RGB565 unpack */
/* vecAlpha * 4 for G channel upscale */
" vmul.i16 q2, q2, %[four] \n"
/* G channel extract */
" vshr.u16 q5, q0, #5 \n"
/* load Unpacking Mask for R channel */
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
" vand q4, q0, q7 \n"
/* load Unpacking Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
" vand q5, q5, q7 \n"
/* scale G vector with alpha vector */
" vmul.u16 q5, q5, q2 \n"
/* B channel */
" vshr.u16 q6, q0, #11 \n"
/* blend G vector with input G color*/
" vmla.u16 q5, q1, %[G] \n"
/* vecAlpha * 8 for R & B upscale */
" vshl.i16 q2, q2, #1 \n"
/* scale R vector with alpha vec */
" vmul.u16 q4, q4, q2 \n"
" vshr.u16 q5, q5, #8 \n"
/* blend R vector with input R color*/
" vmla.u16 q4, q1, %[R] \n"
/* load packing Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
/* scale B vector with alpha vector */
" vmul.u16 q6, q6, q2 \n"
" vand q5, q5, q7 \n"
/* blend B vector with input B color*/
" vmla.u16 q6, q1, %[B] \n"
/* load packing Mask for B channel */
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
" vshr.u16 q6, q6, #8 \n"
/* RGB 565 pack */
/* (G & 0x00fc), 8) */
" vmul.i16 q5, q5, %[eight] \n"
/* (B & 0x00f8) */
" vand q6, q6, q7 \n"
/* load next alpha vector */
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
" vmov.i16 q7, #0x0100 \n"
/* pack G & B */
" vmla.u16 q5, q6, %[twofiftysix] \n"
/* combined (R >> 8) >> 3 */
" vshr.u16 q4, q4, #11 \n"
/* load next target */
" vldrh.u16 q0, [%[pTarget], #16] \n"
/* pack R */
" vorr q4, q4, q5 \n"
" vstrh.16 q4, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
[twofiftysix] "r" (256), [scratch] "r" (scratch), [str4Offs] "t"(vStride4Offs)
:"q0", "q1", "q2", "q4", "q5", "q6", "q7", "memory");
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_colour_filling_channel_mask_opacity(uint16_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint16_t Colour, uint8_t chOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
__arm_2d_color_fast_rgb_t tSrcPix;
__arm_2d_rgb565_unpack(*(&Colour), &tSrcPix);
#ifdef USE_MVE_INTRINSICS
RGB565_COLOUR_FILLING_MASK_MVE(RGB565_TRGT_LOAD_STRIDE, vStride4Offs,
RGB565_SCAL_OPACITY, vOpacity, pwAlpha, 4);
#else
/* RGB565 pack/unpack Masks + opacity */
/* use memory rather than vmov to improve Helium operation interleaving */
uint16x8_t scratch[5];
// Unpacking Mask Red
vst1q((uint16_t*)&scratch[0], vdupq_n_u16(0x1f));
// Unpacking Mask Green
vst1q((uint16_t*)&scratch[1], vdupq_n_u16(0x3f));
// packing Mask Green
vst1q((uint16_t*)&scratch[2], vdupq_n_u16(0xfc));
// packing Mask Blue
vst1q((uint16_t*)&scratch[3], vdupq_n_u16(0xf8));
// opacity
vst1q((uint16_t*)&scratch[4], (uint16x8_t)vOpacity);
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint32_t *pAlpha = pwAlpha;
uint16_t *pCurTarget = pTarget;
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile (
".p2align 2 \n"
/* load scheduling */
" vldrh.u16 q0, [%[pTarget]] \n"
" vmov.i16 q7, #0x0100 \n"
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
/* opacity vector */
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
" vmulh.u8 q1, q1, q6 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" add %[pAlpha], %[pAlpha],#(8*4)\n"
// vecAlpha
" vsub.i16 q2, q7, q1 \n"
/* RGB565 unpack */
/* vecAlpha * 4 for G channel upscale */
" vmul.i16 q2, q2, %[four] \n"
/* G channel extract */
" vshr.u16 q5, q0, #5 \n"
/* load Unpacking Mask for R channel */
" vldrh.u16 q7, [%[scratch], #(0*16)] \n"
" vand q4, q0, q7 \n"
/* load Unpacking Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(1*16)] \n"
" vand q5, q5, q7 \n"
/* scale G vector with alpha vector */
" vmul.u16 q5, q5, q2 \n"
/* B channel */
" vshr.u16 q6, q0, #11 \n"
/* blend G vector with input G color*/
" vmla.u16 q5, q1, %[G] \n"
/* vecAlpha * 8 for R & B upscale */
" vshl.i16 q2, q2, #1 \n"
/* scale R vector with alpha vec */
" vmul.u16 q4, q4, q2 \n"
" vshr.u16 q5, q5, #8 \n"
/* blend R vector with input R color*/
" vmla.u16 q4, q1, %[R] \n"
/* load packing Mask for G channel */
" vldrh.u16 q7, [%[scratch], #(2*16)] \n"
/* scale B vector with alpha vector */
" vmul.u16 q6, q6, q2 \n"
" vand q5, q5, q7 \n"
/* blend B vector with input B color*/
" vmla.u16 q6, q1, %[B] \n"
/* load packing Mask for B channel */
" vldrh.u16 q7, [%[scratch], #(3*16)] \n"
" vshr.u16 q6, q6, #8 \n"
/* RGB 565 pack */
/* (G & 0x00fc), 8) */
" vmul.i16 q5, q5, %[eight] \n"
/* (B & 0x00f8) */
" vand q6, q6, q7 \n"
/* load next alpha vector */
" vldrb.u16 q1, [%[pAlpha],%[str4Offs]]\n"
" vmov.i16 q7, #0x0100 \n"
/* pack G & B */
" vmla.u16 q5, q6, %[twofiftysix] \n"
/* combined (R >> 8) >> 3 */
" vldrh.u16 q6, [%[scratch], #(4*16)] \n"
" vmulh.u8 q1, q1, q6 \n"
" vshr.u16 q4, q4, #11 \n"
/* load next target */
" vldrh.u16 q0, [%[pTarget], #16] \n"
/* pack R */
" vorr q4, q4, q5 \n"
" vstrh.16 q4, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[pTarget]"+r"(pCurTarget),[pAlpha] "+r"(pAlpha),[loopCnt] "+r"(blkCnt)
:[Colour] "r"(Colour), [eight] "r" (8), [four] "r" (4),
[R] "r" (tSrcPix.R), [G] "r" (tSrcPix.G), [B] "r" (tSrcPix.B),
[twofiftysix] "r" (256), [scratch] "r" (scratch), [str4Offs] "t"(vStride4Offs)
:"q0", "q1", "q2", "q4", "q5", "q6", "q7", "memory");
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
#endif
}
__OVERRIDE_WEAK
void __arm_2d_impl_cccn888_colour_filling_mask(uint32_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint32_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint16_t c0, c1, c2;
c0 = Colour & 0xff;
c1 = (Colour >> 8) & 0xff;
c2 = (Colour >> 16) & 0xff;
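/*
* Colour is split into its three 8-bit channels; each pixel is then treated
* as three byte planes (pTargetCh0/1/2) accessed through stride-4 byte
* gathers/scatters, leaving the 'n' byte of every cccn888 pixel untouched
*/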
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t * pAlpha = pchAlpha;
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
uint8_t * pTargetCh1 = pTargetCh0 + 1;
uint8_t * pTargetCh2 = pTargetCh0 + 2;
#ifdef USE_MVE_INTRINSICS
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD, _,
CCCN888_SCAL_OPACITY_NONE, _, 1);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
".p2align 2 \n"
/* expand chan0 */
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vsub.i16 q2, %[vec256], q1 \n"
/* scale ch0 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
/* expand chan1 */
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
/* blend ch0 vector with input ch0 color*/
" vmla.u16 q3, q1, %[c0] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
/* scale ch1 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
/* expand chan2 */
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
/* blend ch1 vector with input ch1 color*/
" vmla.u16 q3, q1, %[c1] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
" adds %[pTargetCh0], #32 \n"
" adds %[pTargetCh1], #32 \n"
/* scale ch2 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
/* blend ch2 vector with input ch2 color*/
" vmla.u16 q3, q1, %[c2] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
" add.w %[pTargetCh2], %[pTargetCh2], #32 \n"
" letp lr, 2b \n"
"1: \n"
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs),
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
:"q0", "q1", "q2", "q3", "memory");
#endif
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_cccn888_colour_filling_mask_opacity(uint32_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint8_t * __RESTRICT pchAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint32_t Colour, uint8_t chOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
uint16_t c0, c1, c2;
c0 = Colour & 0xff;
c1 = (Colour >> 8) & 0xff;
c2 = (Colour >> 16) & 0xff;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t * pAlpha = pchAlpha;
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
uint8_t * pTargetCh1 = pTargetCh0 + 1;
uint8_t * pTargetCh2 = pTargetCh0 + 2;
#ifdef USE_MVE_INTRINSICS
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD, _,
CCCN888_SCAL_OPACITY, vOpacity, 1);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
".p2align 2 \n"
/* expand chan0 */
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vsub.i16 q2, %[vec256], q1 \n"
/* scale ch0 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
/* expand chan1 */
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
/* blend ch0 vector with input ch0 color*/
" vmla.u16 q3, q1, %[c0] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
/* scale ch1 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
/* expand chan2 */
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
/* blend ch1 vector with input ch1 color*/
" vmla.u16 q3, q1, %[c1] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
" adds %[pTargetCh0], #32 \n"
" adds %[pTargetCh1], #32 \n"
/* scale ch2 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
/* blend ch2 vector with input ch2 color*/
" vmla.u16 q3, q1, %[c2] \n"
" vldrb.u16 q1, [%[pAlpha]], #8 \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
" add.w %[pTargetCh2], %[pTargetCh2], #32 \n"
" letp lr, 2b \n"
"1: \n"
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs),
[vOpacity] "t"(vOpacity),
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
:"q0", "q1", "q2", "q3", "memory");
#endif
pchAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_cccn888_colour_filling_channel_mask(uint32_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t * __RESTRICT ptCopySize,
uint32_t Colour)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint16_t c0, c1, c2;
c0 = Colour & 0xff;
c1 = (Colour >> 8) & 0xff;
c2 = (Colour >> 16) & 0xff;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
uint8_t * pTargetCh1 = pTargetCh0 + 1;
uint8_t * pTargetCh2 = pTargetCh0 + 2;
#ifdef USE_MVE_INTRINSICS
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD_STRIDE, vStride4Offs,
CCCN888_SCAL_OPACITY_NONE, _, 4);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
".p2align 2 \n"
/* expand chan0 */
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vsub.i16 q2, %[vec256], q1 \n"
/* scale ch0 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
/* expand chan1 */
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
/* blend ch0 vector with input ch0 color*/
" vmla.u16 q3, q1, %[c0] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
/* scale ch1 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
/* expand chan2 */
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
/* blend ch1 vector with input ch1 color*/
" vmla.u16 q3, q1, %[c1] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
" adds %[pAlpha], #32 \n"
" adds %[pTargetCh0], #32 \n"
/* scale ch2 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
/* blend ch2 vector with input ch2 color*/
" vmla.u16 q3, q1, %[c2] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
" adds %[pTargetCh1], #32 \n"
" adds %[pTargetCh2], #32 \n"
" letp lr, 2b \n"
"1: \n"
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs),
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
:"q0", "q1", "q2", "q3", "memory");
#endif
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_cccn888_colour_filling_channel_mask_opacity(uint32_t * __RESTRICT pTarget,
int16_t iTargetStride,
uint32_t * __RESTRICT pwAlpha,
int16_t iAlphaStride,
arm_2d_size_t *
__RESTRICT ptCopySize,
uint32_t Colour, uint8_t chOpacity)
{
int_fast16_t iHeight = ptCopySize->iHeight;
int_fast16_t iWidth = ptCopySize->iWidth;
uint16x8_t v256 = vdupq_n_u16(256);
uint16x8_t vStride4Offs = vidupq_n_u16(0, 4);
uint8x16_t vOpacity = vdupq_n_u8(chOpacity);
uint16_t c0, c1, c2;
c0 = Colour & 0xff;
c1 = (Colour >> 8) & 0xff;
c2 = (Colour >> 16) & 0xff;
for (int_fast16_t y = 0; y < iHeight; y++) {
const uint8_t *pAlpha = (const uint8_t *)pwAlpha;
uint8_t * pTargetCh0 = (uint8_t*)pTarget;
uint8_t * pTargetCh1 = pTargetCh0 + 1;
uint8_t * pTargetCh2 = pTargetCh0 + 2;
#ifdef USE_MVE_INTRINSICS
CCCN888_COLOUR_FILLING_MASK_INNER_MVE(CCCN888_TRGT_LOAD_STRIDE, vStride4Offs,
CCCN888_SCAL_OPACITY, vOpacity, 4);
#else
register unsigned blkCnt __asm("lr");
blkCnt = iWidth;
__asm volatile(
".p2align 2 \n"
/* expand chan0 */
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
" vsub.i16 q2, %[vec256], q1 \n"
/* scale ch0 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
/* expand chan1 */
" vldrb.u16 q0, [%[pTargetCh1], %[str4Offs]] \n"
/* blend ch0 vector with input ch0 color*/
" vmla.u16 q3, q1, %[c0] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh0], %[str4Offs]] \n"
/* scale ch1 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
/* expand chan2 */
" vldrb.u16 q0, [%[pTargetCh2], %[str4Offs]] \n"
/* blend ch1 vector with input ch1 color*/
" vmla.u16 q3, q1, %[c1] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh1], %[str4Offs]] \n"
" adds %[pAlpha], #32 \n"
" adds %[pTargetCh0], #32 \n"
/* scale ch2 vector with alpha vector */
" vmul.u16 q3, q0, q2 \n"
" vldrb.u16 q0, [%[pTargetCh0], %[str4Offs]] \n"
/* blend ch2 vector with input ch2 color*/
" vmla.u16 q3, q1, %[c2] \n"
" vldrb.u16 q1, [%[pAlpha], %[str4Offs]] \n"
" vmulh.u8 q1, q1, %[vOpacity] \n"
" vshr.u16 q3, q3, #8 \n"
" vstrb.u16 q3, [%[pTargetCh2], %[str4Offs]] \n"
" adds %[pTargetCh1], #32 \n"
" adds %[pTargetCh2], #32 \n"
" letp lr, 2b \n"
"1: \n"
:[pTargetCh0] "+r"(pTargetCh0), [pTargetCh1] "+r"(pTargetCh1),
[pTargetCh2] "+r"(pTargetCh2), [pAlpha] "+r" (pAlpha), [loopCnt] "+r"(blkCnt)
:[vec256] "t" (v256),[str4Offs] "t" (vStride4Offs), [vOpacity] "t"(vOpacity),
[c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)
:"q0", "q1", "q2", "q3", "memory");
#endif
pwAlpha += (iAlphaStride);
pTarget += (iTargetStride);
}
}
/*----------------------------------------------------------------------------*
* Convert Colour format *
*----------------------------------------------------------------------------*/
__OVERRIDE_WEAK
void __arm_2d_impl_cccn888_to_rgb565(uint32_t *__RESTRICT pwSourceBase,
int16_t iSourceStride,
uint16_t *__RESTRICT phwTargetBase,
int16_t iTargetStride,
arm_2d_size_t *__RESTRICT ptCopySize)
{
int32_t blkCnt;
uint32x4_t maskR = vdupq_n_u32(0x001f);
uint32x4_t maskG = vdupq_n_u32(0x07e0);
uint32x4_t maskB = vdupq_n_u32(0xf800);
for (int_fast16_t y = 0; y < ptCopySize->iHeight; y++) {
const uint32_t *pSource = pwSourceBase;
uint16_t *pTarget = phwTargetBase;
blkCnt = ptCopySize->iWidth;
#ifdef USE_MVE_INTRINSICS
do {
mve_pred16_t tailPred = vctp32q(blkCnt);
/* load a vector of 4 cccn888 pixels */
uint32x4_t vecIn = vld1q_z(pSource, tailPred);
/* extract individual channels and place them according to bit position */
uint32x4_t vecR = (vecIn >> 3) & maskR;
uint32x4_t vecG = (vecIn >> 5) & maskG;
uint32x4_t vecB = (vecIn >> 8) & maskB;
/* merge */
uint32x4_t vOut = vecR | vecG | vecB;
/* store a vector of 4 rgb565 pixels */
vstrhq_p_u32(pTarget, vOut, tailPred);
pSource += 4;
pTarget += 4;
blkCnt -= 4;
}
while (blkCnt > 0);
#else
const int32_t inv_2pow3 = 1 << (31-3); /*1/2^3 in Q.31 */
const int32_t inv_2pow5 = 1 << (31-5); /*1/2^5 in Q.31 */
const int32_t inv_2pow8 = 1 << (31-8); /*1/2^8 in Q.31 */
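/*
* Informative note: VQDMULH.S32 returns the saturated high half of 2 * a * b,
* so with b = 1 << (31 - n) it behaves like an arithmetic right shift of a
* by n, e.g. vqdmulh(0x00345678, 1 << 28) == 0x00068ACF == 0x00345678 >> 3.
* The AND masks applied afterwards keep only low bits, so the sign handling
* is irrelevant here; as noted in the asm comments, the multiply form
* overlaps better with the following AND than a vector shift would.
*/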
__asm volatile(
" wlstp.32 lr, %[loopCnt], 1f \n"
/* precompute to allow filling stalls in the inner loop */
/* use vqdmulh to replace shifts to allow overlap with 'AND' */
/* load a vector of 4 cccn888 pixels */
" vldrw.u32 q0, [%[pSource]], #16 \n"
/* mimic right shift by 3 */
" vqdmulh.s32 q1, q0, %[inv_2pow3] \n"
".p2align 2 \n"
"2: \n"
" vand q1, q1, %[maskR] \n"
/* mimic right shift by 5 */
" vqdmulh.s32 q2, q0, %[inv_2pow5] \n"
" vand q2, q2, %[maskG] \n"
/* mimic right shift by 8 */
" vqdmulh.s32 q3, q0, %[inv_2pow8] \n"
/* accumulate R & G */
" vorr q2, q1, q2 \n"
/* load next vector of 4 cccn888 pixels */
" vldrw.u32 q0, [%[pSource]], #16 \n"
" vand q3, q3, %[maskB] \n"
/* mimic right shift by 3 */
" vqdmulh.s32 q1, q0, %[inv_2pow3] \n"
/* accumulate B */
" vorr q2, q2, q3 \n"
/* store a vector of 4 rgb565 pixels */
" vstrh.32 q2, [%[pTarget]], #8 \n"
" letp lr, 2b \n"
"1: \n"
: [pSource] "+r"(pSource), [pTarget] "+r" (pTarget)
: [loopCnt] "r"(blkCnt), [inv_2pow3] "r" (inv_2pow3),
[inv_2pow5] "r" (inv_2pow5), [inv_2pow8] "r" (inv_2pow8),
[maskR] "t" (maskR),[maskG] "t" (maskG),[maskB] "t" (maskB)
: "q0", "q1", "q2", "q3", "memory", "r14" );
#endif
pwSourceBase += iSourceStride;
phwTargetBase += iTargetStride;
}
}
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_to_cccn888(uint16_t *__RESTRICT phwSourceBase,
int16_t iSourceStride,
uint32_t *__RESTRICT pwTargetBase,
int16_t iTargetStride,
arm_2d_size_t *__RESTRICT ptCopySize)
{
int32_t blkCnt;
uint32x4_t maskRB = vdupq_n_u32(0xf8);
uint32x4_t maskG = vdupq_n_u32(0xfc00);
for (int_fast16_t y = 0; y < ptCopySize->iHeight; y++) {
const uint16_t *__RESTRICT phwSource = phwSourceBase;
uint32_t *__RESTRICT pwTarget = pwTargetBase;
blkCnt = ptCopySize->iWidth;
#ifdef USE_MVE_INTRINSICS
do {
mve_pred16_t tailPred = vctp32q(blkCnt);
/* load a vector of 4 rgb565 pixels */
uint32x4_t vecIn = vldrhq_z_u32(phwSource, tailPred);
/* extract individual channels and place them according to position */
uint32x4_t vecR = (vecIn << 3) & maskRB;
uint32x4_t vecG = (vecIn << 5) & maskG;
uint32x4_t vecB = ((vecIn >> 8) & maskRB) << 16;
/* merge and set n channel to 0xff */
uint32x4_t vOut = 0xff000000 | vecR | vecG | vecB;
/* store a vector of 4 cccn888 pixels */
vst1q_p(pwTarget, vOut, tailPred);
phwSource += 4;
pwTarget += 4;
blkCnt -= 4;
}
while (blkCnt > 0);
#else
__asm volatile(
" wlstp.32 lr, %[loopCnt], 1f \n"
/* precompute to allow filling stalls in the inner loop */
/* use vqdmulh & vmul to replace shifts to allow overlap with 'AND' */
/* load a vector of 4 rgb565 pixels */
" vldrh.u32 q0, [%[pSource]], #8 \n"
/* mimic left shift by 3 */
" vmul.u32 q1, q0, %[two_pow3] \n"
".p2align 2 \n"
"2: \n"
/* mimic left shift by 5 */
" vmul.u32 q2, q0, %[two_pow5] \n"
" vand q1, q1, %[maskRB] \n"
/* mimic right shift by 8 */
" vqdmulh.s32 q3, q0, %[inv_2pow8] \n"
" vand q2, q2, %[maskG] \n"
/* accumulate G & R, use vmla instead of vorr for best overlap */
" vmla.u32 q2, q1, %[one] \n"
" vand q3, q3, %[maskRB] \n"
/* accumulate B + left shift by 16 */
" vmla.u32 q2, q3, %[two_pow16] \n"
/* load next vector of 4 rgb565 pixels */
" vldrh.u32 q0, [%[pSource]], #8 \n"
/* merge and set n channel to 0xff */
" vorr.i32 q2, #0xff000000 \n"
/* mimic left shift by 3 */
" vmul.u32 q1, q0, %[two_pow3] \n"
/* store a vector of 4 cccn888 pixels */
" vstrw.32 q2, [%[pTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [pSource] "+r"(phwSource), [pTarget] "+r" (pwTarget)
: [loopCnt] "r"(blkCnt),[two_pow3] "r" (1<<3), [two_pow5] "r" (1<<5),
[two_pow16] "r" (1<<16),[inv_2pow8] "r" (1 << (31-8)),
[maskRB] "t" (maskRB),[maskG] "t" (maskG), [one] "r" (1)
: "q0", "q1", "q2", "q3", "memory", "r14" );
#endif
phwSourceBase += iSourceStride;
pwTargetBase += iTargetStride;
}
}
void __arm_2d_impl_rgb565_masks_fill(uint16_t * __RESTRICT ptSourceBase,
int16_t iSourceStride,
arm_2d_size_t * __RESTRICT ptSourceSize,
uint8_t * __RESTRICT pchSourceMaskBase,
int16_t iSourceMaskStride,
arm_2d_size_t * __RESTRICT ptSourceMaskSize,
uint16_t * __RESTRICT ptTargetBase,
int16_t iTargetStride,
arm_2d_size_t * __RESTRICT ptTargetSize,
uint8_t * __RESTRICT pchTargetMaskBase,
int16_t iTargetMaskStride,
arm_2d_size_t * __RESTRICT ptTargetMaskSize)
{
uint8_t *__RESTRICT pchTargetMaskLineBase = pchTargetMaskBase;
uint16x8_t v256 = vdupq_n_u16(256);
#ifndef USE_MVE_INTRINSICS
uint16x8_t scratch[5];
/* vector of 256, avoiding use of vdup to increase overlap efficiency */
vst1q((uint16_t *) & scratch[0], v256);
/* scratch[1] is temporary for blended Red chan. vector */
/* G channel unpacking mask (0xfc) */
vst1q((uint16_t *) & scratch[2], vdupq_n_u16(0x00fc));
/* B channel packing mask */
vst1q((uint16_t *) & scratch[3], vdupq_n_u16(0xf800));
/* G channel packing Mask */
vst1q((uint16_t *) & scratch[4], vdupq_n_u16(0x07e0));
/* use of fixed point mult instead of vshr to increase overlap efficiency */
const int16_t inv_2pow3 = 1 << (15 - 3); /* 1/(2^3) in Q.15 */
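/* For a non-negative 16-bit lane x, vqdmulh.s16 against this Q.15 constant
 * reproduces the shift: (2 * x * 2^12) >> 16 == x >> 3, which lets the
 * multiply overlap with the neighbouring loads and ANDs in the loop below. */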
#endif
for (int_fast16_t iTargetY = 0; iTargetY < ptTargetSize->iHeight;) {
uint16_t *__RESTRICT ptSource = ptSourceBase;
uint8_t *pchSourceMask = pchSourceMaskBase;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
int_fast16_t iSourceMaskY = 0;
#endif
for (int_fast16_t iSourceY = 0; iSourceY < ptSourceSize->iHeight; iSourceY++) {
uint16_t *__RESTRICT ptTarget = ptTargetBase;
uint8_t *__RESTRICT pchTargetMask = pchTargetMaskLineBase;
uint_fast32_t wLengthLeft = ptTargetSize->iWidth;
do {
uint_fast32_t wLength = MIN(wLengthLeft, ptSourceSize->iWidth);
uint16_t *__RESTRICT ptSrc = ptSource;
uint8_t *__RESTRICT pchSrcMsk = pchSourceMask;
uint16_t *__RESTRICT ptTargetCur = ptTarget;
uint8_t *__RESTRICT pchTargetMaskCur = pchTargetMask;
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt = wLength;
do {
uint16x8_t vecTarget = vld1q(ptTargetCur);
uint16x8_t vecSource = vld1q(ptSrc);
uint16x8_t vecSrcMsk = vldrbq_u16(pchSrcMsk);
uint16x8_t vecTargetMask = vldrbq_u16(pchTargetMaskCur);
uint16x8_t vecHwOpacity =
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8);
vecTarget = __arm_2d_rgb565_blending_opacity_single_vec(
vecTarget, vecSource, vecHwOpacity);
/* tail predication */
vst1q_p_u16(ptTargetCur, vecTarget, vctp16q(blkCnt));
pchSrcMsk += 8;
pchTargetMaskCur += 8;
ptTargetCur += 8;
ptSrc += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
register unsigned blkCnt __asm("lr");
blkCnt = wLength;
__asm volatile (
/* R & B mask */
"vecRBUnpackMask .req q7 \n"
"vecAlpha .req q5 \n"
"vecHwOpacity .req q3 \n"
/* preload */
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
" vmov.i16 vecRBUnpackMask, #0x00f8 \n"
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
".p2align 2 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
/* vecSrcMsk * vecTargetMask */
" vmul.i16 q0, q5, q0 \n"
" vldrh.u16 q6, [%[ptTarget]] \n"
" vshr.u16 vecAlpha, q0, #8 \n"
/* 256-dup vector */
" vldrh.u16 q1, [%[scratch], #(16*0)] \n"
/* vecHwOpacity =
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8) */
" vsub.i16 vecHwOpacity, q1, vecAlpha \n"
" vldrh.u16 q1, [%[ptSrc]], #16 \n"
/* mimic vshl #3 */
" vshl.u16 q0, q6, #3 \n"
" vmul.i16 q4, q1, %[eight] \n"
/* vecR extract and scale */
" vand q0, q0, vecRBUnpackMask \n"
" vmul.i16 q0, vecHwOpacity, q0 \n"
/* vecSrcR extract and scale */
" vand q4, q4, vecRBUnpackMask \n"
" vmul.i16 q4, vecAlpha, q4 \n"
/* 0xfc G-mask */
" vldrw.u32 q2, [%[scratch], #(16*2)] \n"
" vadd.i16 q4, q0, q4 \n"
/* push blended R */
" vstrw.32 q4, [%[scratch], #(16*1)] \n"
/* mimic vshr.u16 q4, q6, #3 */
" vqdmulh.s16 q4, q6, %[inv_2pow3] \n"
" vshr.u16 q0, q1, #3 \n"
/* vecG extract and scale */
" vand q4, q4, q2 \n"
" vmul.i16 q4, vecHwOpacity, q4 \n"
/* vecSrcG extract and scale */
" vand q0, q0, q2 \n"
" vmul.i16 q2, vecAlpha, q0 \n"
" vshr.u16 q0, q1, #8 \n"
/* blended G */
/* vadd.i16 q2, q4, q2
addition using vmla for more efficient overlap */
" vmla.u16 q2, q4, %[one] \n"
/* vecB extract and scale */
" vshr.u16 q4, q6, #8 \n"
" vand q4, q4, vecRBUnpackMask \n"
" vmul.i16 q4, vecHwOpacity, q4 \n"
/* vecSrcB extract and scale */
" vand q0, q0, vecRBUnpackMask \n"
" vmul.i16 q0, vecAlpha, q0 \n"
".unreq vecAlpha \n"
".unreq vecHwOpacity \n"
".unreq vecRBUnpackMask \n"
/* reload blended R */
" vldrw.u32 q1, [%[scratch], #(16*1)] \n"
/* blended B
vadd.i16 q0, q4, q0
addition using vmla for more efficient overlap */
" vmla.u16 q0, q4, %[one] \n"
/* pack R */
" vshr.u16 q3, q1, #11 \n"
/* B channel packing mask 0xf800 */
" vldrw.u32 q4, [%[scratch], #(16*3)] \n"
" vand q0, q0, q4 \n"
/* accumulate R & B */
" vorr q4, q0, q3 \n"
/* G channel packing mask 0x07e0 */
" vldrw.u32 q3, [%[scratch], #(16*4)] \n"
" vshr.u16 q2, q2, #5 \n"
/* load next source mask */
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
/* G channel masking */
" vand q2, q2, q3 \n"
/* load next target mask */
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
/* pack G with R.B */
" vorr q4, q4, q2 \n"
" vstrh.16 q4, [%[ptTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[ptTarget] "+r"(ptTargetCur),[ptSrc] "+r"(ptSrc),
[pchTargetMask] "+l"(pchTargetMaskCur),[pchSrcMsk] "+l"(pchSrcMsk),
[loopCnt] "+r"(blkCnt)
:[scratch] "r" (scratch),[eight] "r"(8),[inv_2pow3] "r"(inv_2pow3),
[one] "r" (1)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
#endif
ptTarget += wLength;
pchTargetMask += wLength;
wLengthLeft -= wLength;
} while (wLengthLeft);
ptSource += iSourceStride;
ptTargetBase += iTargetStride;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
iSourceMaskY++;
//! handle source mask
if ( (iSourceMaskY >= ptSourceMaskSize->iHeight)
|| (iSourceMaskY >= ptSourceSize->iHeight)) {
pchSourceMask = pchSourceMaskBase;
iSourceMaskY = 0;
} else {
pchSourceMask += iSourceMaskStride;
}
#else
pchSourceMask += iSourceMaskStride;
#endif
pchTargetMaskLineBase += iTargetMaskStride;
iTargetY++;
if (iTargetY >= ptTargetSize->iHeight) {
break;
}
}
}
}
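/*
 * Illustrative scalar reference (not used by the routines above): the
 * per-pixel blend performed by the two masked fill routines. The combined
 * opacity is 256 - ((srcMask * targetMask) >> 8); each rgb565 channel is
 * expanded to 8-bit scale, blended, then repacked. The helper name is
 * hypothetical and the channel naming follows the asm comments above
 * (low 5 bits treated as 'R', high 5 bits as 'B').
 */
static inline uint16_t __arm_2d_rgb565_masks_blend_scalar(uint16_t hwTarget,
                                                          uint16_t hwSource,
                                                          uint8_t chSrcMask,
                                                          uint8_t chTgtMask)
{
    uint16_t hwAlpha   = (uint16_t)((chSrcMask * chTgtMask) >> 8);
    uint16_t hwOpacity = (uint16_t)(256 - hwAlpha);

    /* unpack both pixels to 8-bit-scaled channels */
    uint16_t hwTgtR = (hwTarget << 3) & 0xF8, hwSrcR = (hwSource << 3) & 0xF8;
    uint16_t hwTgtG = (hwTarget >> 3) & 0xFC, hwSrcG = (hwSource >> 3) & 0xFC;
    uint16_t hwTgtB = (hwTarget >> 8) & 0xF8, hwSrcB = (hwSource >> 8) & 0xF8;

    /* weighted blend; results stay within 16 bits since alpha + opacity == 256 */
    uint16_t hwR = (uint16_t)(hwTgtR * hwOpacity + hwSrcR * hwAlpha);
    uint16_t hwG = (uint16_t)(hwTgtG * hwOpacity + hwSrcG * hwAlpha);
    uint16_t hwB = (uint16_t)(hwTgtB * hwOpacity + hwSrcB * hwAlpha);

    /* repack to rgb565, mirroring the packing masks used in the asm */
    return (uint16_t)((hwR >> 11) | ((hwG >> 5) & 0x07E0) | (hwB & 0xF800));
}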
__OVERRIDE_WEAK
void __arm_2d_impl_rgb565_src_msk_1h_des_msk_fill(
uint16_t * __RESTRICT ptSourceBase,
int16_t iSourceStride,
arm_2d_size_t * __RESTRICT ptSourceSize,
uint8_t * __RESTRICT pchSourceMaskBase,
int16_t iSourceMaskStride,
arm_2d_size_t * __RESTRICT ptSourceMaskSize,
uint16_t * __RESTRICT ptTargetBase,
int16_t iTargetStride,
arm_2d_size_t * __RESTRICT ptTargetSize,
uint8_t * __RESTRICT pchTargetMaskBase,
int16_t iTargetMaskStride,
arm_2d_size_t * __RESTRICT ptTargetMaskSize)
{
uint8_t *__RESTRICT pchTargetMaskLineBase = pchTargetMaskBase;
uint16x8_t v256 = vdupq_n_u16(256);
#ifndef USE_MVE_INTRINSICS
uint16x8_t scratch[5];
/* vector of 256, kept in memory to avoid vdup inside the loop and improve overlap efficiency */
vst1q((uint16_t *) & scratch[0], v256);
/* scratch[1] is temporary for blended Red chan. vector */
/* G channel unpacking mask (0xfc) */
vst1q((uint16_t *) & scratch[2], vdupq_n_u16(0x00fc));
/* B channel packing mask */
vst1q((uint16_t *) & scratch[3], vdupq_n_u16(0xf800));
/* G channel packing Mask */
vst1q((uint16_t *) & scratch[4], vdupq_n_u16(0x07e0));
/* use of fixed point mult instead of vshr to increase overlap efficiency */
const int16_t inv_2pow3 = 1 << (15 - 3); /* 1/(2^3) in Q.15 */
#endif
for (int_fast16_t iTargetY = 0; iTargetY < ptTargetSize->iHeight;) {
uint16_t *__RESTRICT ptSource = ptSourceBase;
uint8_t *pchSourceMask = pchSourceMaskBase;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
int_fast16_t iSourceMaskY = 0;
#endif
for (int_fast16_t iSourceY = 0; iSourceY < ptSourceSize->iHeight; iSourceY++) {
uint16_t *__RESTRICT ptTarget = ptTargetBase;
uint8_t *__RESTRICT pchTargetMask = pchTargetMaskLineBase;
uint_fast32_t wLengthLeft = ptTargetSize->iWidth;
do {
uint_fast32_t wLength = MIN(wLengthLeft, ptSourceSize->iWidth);
uint16_t *__RESTRICT ptSrc = ptSource;
uint8_t *__RESTRICT pchSrcMsk = pchSourceMask;
uint16_t *__RESTRICT ptTargetCur = ptTarget;
uint8_t *__RESTRICT pchTargetMaskCur = pchTargetMask;
#ifdef USE_MVE_INTRINSICS
int32_t blkCnt = wLength;
do {
uint16x8_t vecTarget = vld1q(ptTargetCur);
uint16x8_t vecSource = vld1q(ptSrc);
uint16x8_t vecSrcMsk = vldrbq_u16(pchSrcMsk);
uint16x8_t vecTargetMask = vldrbq_u16(pchTargetMaskCur);
uint16x8_t vecHwOpacity =
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8);
vecTarget = __arm_2d_rgb565_blending_opacity_single_vec(
vecTarget, vecSource, vecHwOpacity);
/* tail predication */
vst1q_p_u16(ptTargetCur, vecTarget, vctp16q(blkCnt));
pchSrcMsk += 8;
pchTargetMaskCur += 8;
ptTargetCur += 8;
ptSrc += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
#else
register unsigned blkCnt __asm("lr");
blkCnt = wLength;
__asm volatile (
/* R & B mask */
"vecRBUnpackMask .req q7 \n"
"vecAlpha .req q5 \n"
"vecHwOpacity .req q3 \n"
/* preload */
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
" vmov.i16 vecRBUnpackMask, #0x00f8 \n"
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
".p2align 2 \n"
" wlstp.16 lr, %[loopCnt], 1f \n"
"2: \n"
/* vecSrcMsk * vecTargetMask */
" vmul.i16 q0, q5, q0 \n"
" vldrh.u16 q6, [%[ptTarget]] \n"
" vshr.u16 vecAlpha, q0, #8 \n"
/* 256-dup vector */
" vldrh.u16 q1, [%[scratch], #(16*0)] \n"
/* vecHwOpacity =
vsubq_u16(v256, (vecSrcMsk * vecTargetMask) >> 8) */
" vsub.i16 vecHwOpacity, q1, vecAlpha \n"
" vldrh.u16 q1, [%[ptSrc]], #16 \n"
/* mimic vshl #3 */
" vshl.u16 q0, q6, #3 \n"
" vmul.i16 q4, q1, %[eight] \n"
/* vecR extract and scale */
" vand q0, q0, vecRBUnpackMask \n"
" vmul.i16 q0, vecHwOpacity, q0 \n"
/* vecSrcR extract and scale */
" vand q4, q4, vecRBUnpackMask \n"
" vmul.i16 q4, vecAlpha, q4 \n"
/* 0xfc G-mask */
" vldrw.u32 q2, [%[scratch], #(16*2)] \n"
" vadd.i16 q4, q0, q4 \n"
/* push blended R */
" vstrw.32 q4, [%[scratch], #(16*1)] \n"
/* mimic vshr.u16 q4, q6, #3 */
" vqdmulh.s16 q4, q6, %[inv_2pow3] \n"
" vshr.u16 q0, q1, #3 \n"
/* vecG extract and scale */
" vand q4, q4, q2 \n"
" vmul.i16 q4, vecHwOpacity, q4 \n"
/* vecSrcG extract and scale */
" vand q0, q0, q2 \n"
" vmul.i16 q2, vecAlpha, q0 \n"
" vshr.u16 q0, q1, #8 \n"
/* blended G */
/* vadd.i16 q2, q4, q2
addition using vmla for more efficient overlap */
" vmla.u16 q2, q4, %[one] \n"
/* vecB extract and scale */
" vshr.u16 q4, q6, #8 \n"
" vand q4, q4, vecRBUnpackMask \n"
" vmul.i16 q4, vecHwOpacity, q4 \n"
/* vecSrcB extract and scale */
" vand q0, q0, vecRBUnpackMask \n"
" vmul.i16 q0, vecAlpha, q0 \n"
".unreq vecAlpha \n"
".unreq vecHwOpacity \n"
".unreq vecRBUnpackMask \n"
/* reload blended R */
" vldrw.u32 q1, [%[scratch], #(16*1)] \n"
/* blended B
vadd.i16 q0, q4, q0
addition using vmla for more efficient overlap */
" vmla.u16 q0, q4, %[one] \n"
/* pack R */
" vshr.u16 q3, q1, #11 \n"
/* B channel packing mask 0xf800 */
" vldrw.u32 q4, [%[scratch], #(16*3)] \n"
" vand q0, q0, q4 \n"
/* accumulate R & B */
" vorr q4, q0, q3 \n"
/* G channel packing mask 0x07e0 */
" vldrw.u32 q3, [%[scratch], #(16*4)] \n"
" vshr.u16 q2, q2, #5 \n"
/* load next source mask */
" vldrb.u16 q0, [%[pchSrcMsk]], #8 \n"
/* G channel masking */
" vand q2, q2, q3 \n"
/* load next target mask */
" vldrb.u16 q5, [%[pchTargetMask]], #8 \n"
/* pack G with R.B */
" vorr q4, q4, q2 \n"
" vstrh.16 q4, [%[ptTarget]], #16 \n"
" letp lr, 2b \n"
"1: \n"
:[ptTarget] "+r"(ptTargetCur),[ptSrc] "+r"(ptSrc),
[pchTargetMask] "+l"(pchTargetMaskCur),[pchSrcMsk] "+l"(pchSrcMsk),
[loopCnt] "+r"(blkCnt)
:[scratch] "r" (scratch),[eight] "r"(8),[inv_2pow3] "r"(inv_2pow3),
[one] "r" (1)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
#endif
ptTarget += wLength;
pchTargetMask += wLength;
wLengthLeft -= wLength;
} while (wLengthLeft);
ptSource += iSourceStride;
ptTargetBase += iTargetStride;
#if __API_CAFWM_CFG_SUPPORT_SRC_MSK_WRAPING
iSourceMaskY++;
//! handle source mask
if ( (iSourceMaskY >= ptSourceMaskSize->iHeight)
|| (iSourceMaskY >= ptSourceSize->iHeight)) {
pchSourceMask = pchSourceMaskBase;
iSourceMaskY = 0;
} else {
pchSourceMask += iSourceMaskStride;
}
#else
pchSourceMask += iSourceMaskStride;
#endif
pchTargetMaskLineBase = pchTargetMaskBase;
iTargetY++;
if (iTargetY >= ptTargetSize->iHeight) {
break;
}
}
}
}
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
#ifdef __cplusplus
}
#endif
#endif // __ARM_2D_HAS_HELIUM__