/*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      Arm-2D Library
 * Title:        arm-2d_paving_helium.h
 * Description:  Provides definitions and code templates for generic paving
 *
 * $Date:        17. Sep 2021
 * $Revision:    V 0.6.0
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */


#ifndef __ARM_2D_PAVING_HELIUM_H__
#define __ARM_2D_PAVING_HELIUM_H__

/*============================ INCLUDES ======================================*/
#include "arm_2d.h"
#include "__arm_2d_paving.h"

#ifdef   __cplusplus
extern "C" {
#endif

/*============================ MACROS ========================================*/
/*============================ MACROFIED FUNCTIONS ===========================*/


/*
 * Drop any earlier definition of the load helpers, the mirror-setup helpers
 * and the paving templates (presumably the generic ones coming from
 * __arm_2d_paving.h) before the Helium variants are defined in this header.
 * #undef on a name that is not defined is harmless.
 */
#undef LOAD_SRC_DIRECT_8
#undef LOAD_SRC_DIRECT_16
#undef LOAD_SRC_DIRECT_32
#undef LOAD_SRC_X_MIRROR_8
#undef LOAD_SRC_X_MIRROR_16
#undef LOAD_SRC_X_MIRROR_32
#undef SETUP_MIRROR_COPY_8
#undef SETUP_MIRROR_COPY_16
#undef SETUP_MIRROR_COPY_32
#undef __ARM_2D_PAVING_2x2
#undef __ARM_2D_PAVING_1x2
#undef __ARM_2D_PAVING_2x1
#undef __ARM_2D_PAVING_1x1


/*
 * Colour-masking step: narrow the predicate `p` so that it stays set only
 * for the active lanes of `in` that differ from `colour`.
 *
 * `in` and `p` are not parameters: they are the identifiers visible at the
 * expansion site. `sz` is accepted but not used by this variant.
 */
#define CMP_CL_MSK(sz, colour)   p = vcmpneq_m(in, colour, p)


/*
 * Source-row load helpers, one per pixel size (8 / 16 / 32 bits).
 *
 * Each helper expands to an expression followed by an update statement, so
 * it is meant to be used as `in = LOAD_SRC_xxx(pSource, offset);`.
 * The predicate `p` is taken from the expansion site; the X_MIRROR variants
 * additionally take `curOffsetIdx` from there (see SETUP_MIRROR_COPY_xx).
 *
 * DIRECT   : zeroing-predicated contiguous load, after which `pSource` is
 *            advanced by one vector (16, 8 or 4 elements). `offset` is not
 *            used.
 * X_MIRROR : zeroing-predicated gather load through the `offset` vector
 *            (element-scaled offsets for the 16 and 32-bit variants), after
 *            which `offset` is refreshed from `curOffsetIdx` with a step of
 *            1. `pSource` itself is left unchanged.
 */
#define LOAD_SRC_DIRECT_8(pSource, offset)                                      \
            vld1q_z(pSource, p);                                                \
            pSource += 16;

#define LOAD_SRC_DIRECT_16(pSource, offset)                                     \
            vld1q_z(pSource, p);                                                \
            pSource += 8;

#define LOAD_SRC_DIRECT_32(pSource, offset)                                     \
            vld1q_z(pSource, p);                                                \
            pSource += 4;

#define LOAD_SRC_X_MIRROR_8(pSource, offset)                                    \
            vldrbq_gather_offset_z(pSource, offset, p);                         \
            offset = vddupq_x_wb_u8(&curOffsetIdx, 1, p);

#define LOAD_SRC_X_MIRROR_16(pSource, offset)                                   \
            vldrhq_gather_shifted_offset_z(pSource, offset, p);                 \
            offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p);

#define LOAD_SRC_X_MIRROR_32(pSource, offset)                                   \
            vldrwq_gather_shifted_offset_z(pSource, offset, p);                 \
            offset = vddupq_x_wb_u32(&curOffsetIdx, 1, p);

/*
 * Prepare the Helium gather-load offsets for a mirrored row copy.
 *
 * Each variant declares, at the expansion site, `curOffsetIdx` (initialised
 * to srcWidth - 1, i.e. the index of the last pixel of the row) and `offset`,
 * the first offset vector produced by vddupq_wb with a step of 1. Both names
 * are then consumed by the matching LOAD_SRC_X_MIRROR_xx helper.
 *
 * NOTE: SETUP_MIRROR_COPY_8 builds 8-bit offset lanes. The paving templates
 * divert 8-bit mirrored tiles with srcWidth >= 256 to the dedicated
 * *_8BIT_X_MIRROR_FIXUP variants, which use 16-bit offsets instead.
 */
#define SETUP_MIRROR_COPY_8(srcWidth)                                           \
            uint32_t curOffsetIdx = srcWidth - 1;                               \
            uint8x16_t offset = vddupq_wb_u8(&curOffsetIdx, 1);

#define SETUP_MIRROR_COPY_16(srcWidth)                                          \
            uint32_t curOffsetIdx = srcWidth - 1;                               \
            uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1);

#define SETUP_MIRROR_COPY_32(srcWidth)                                          \
            uint32_t curOffsetIdx = srcWidth - 1;                               \
            uint32x4_t offset = vddupq_wb_u32(&curOffsetIdx, 1);


/*
 * Selectors used by the paving templates (through IS_##LOAD) to detect the
 * load patterns that need the dedicated 8-bit X-mirror fix-up path.
 * Only the X and XY mirror patterns on 8-bit pixels evaluate to 1; every
 * other combination evaluates to 0.
 * The argument is parenthesised so that a compound expression is compared
 * as a whole.
 */
#define IS_PAVING_DIRECT_LOAD_PATTERN(sz)            (0)
#define IS_PAVING_X_MIRROR_LOAD_PATTERN(sz)          (((sz) == 8) ? 1 : 0)
#define IS_PAVING_Y_MIRROR_LOAD_PATTERN(sz)          (0)
#define IS_PAVING_XY_MIRROR_LOAD_PATTERN(sz)         (((sz) == 8) ? 1 : 0)


/*
 * 2x2 paving of an 8-bit tile with X or XY mirroring when the tile is too
 * wide for 8-bit gather offsets (selected by __ARM_2D_PAVING_2x2 when
 * srcWidth >= 256).
 *
 * The source row is gathered with 16-bit offsets, 8 pixels per iteration,
 * each pixel being widened into a 16-bit lane, and written back with
 * narrowing byte stores. This allows tile widths up to 65K.
 *
 * SRC_OFFSET   : element offset added to the current source row pointer
 * DIR          : multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR : not used here, kept for parity with __ARM_2D_PAVING_2x2
 * LOAD         : not used here, kept for parity with __ARM_2D_PAVING_2x2
 * ...          : optional statement executed between the load and the stores
 *                (colour-masking placeholder); it may read `in` and update
 *                the predicate `p`
 *
 * Names taken from the expansion site: pSourceBase, pSourceBaseCur,
 * pTargetBaseCur, srcWidth, srcHeight, iSourceStride, iTargetStride,
 * tilePairCols, tilePairRows (decremented down to 0) and ptSourceSize.
 */
#define __ARM_2D_PAVING_2x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR,                \
                                                      SETUP_MIRROR, LOAD, ...)  \
{                                                                               \
    /* row iteration */                                                         \
    /* handle pair of source image row and fill in the column direction */      \
    do {                                                                        \
        uint32_t        rowCnt = 0;                                             \
                                                                                \
        pSourceBaseCur = pSourceBase;                                           \
                                                                                \
        /* single source row loop */                                            \
        do {                                                                    \
            uint8_t            *pDst = (uint8_t*)pTargetBaseCur;                \
            uint32_t            tilePairColsCnt = tilePairCols;                 \
                                                                                \
            /* column loop */                                                   \
            /* duplicate current source row into 2 x 2 destination across */    \
            /* destination columns */                                           \
            /*                                                                  \
             *  +-------+                                                       \
             *  |xxxxxxx|                                                       \
             *  |  src  |                                                       \
             *  +-------+                                                       \
             *                                                                  \
             *            <--------------->                                     \
             *                                                                  \
             *           ||       |       ||       |       ||                   \
             *       *   +========+=======+========+=======+                    \
             *       |   ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||...                \
             *       |   ||       |       ||       |       ||...                \
             *       |   +--------+-------+--------+-------+-...                \
             *       |   ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||...                \
             *       |   ||       |       ||       |       ||...                \
             *       *   +========+=======+========+=======+                    \
             *           ||       |       ||       |       ||                   \
             */                                                                 \
            do {                                                                \
                int32_t              dstColCnt = srcWidth;                      \
                const uint8_t       *pSource = (const uint8_t*)pSourceBaseCur   \
                                                              + SRC_OFFSET;     \
                uint8_t             *pDst00 = (uint8_t*)pDst;                   \
                uint8_t             *pDst01 = (uint8_t*)pDst00 + srcWidth;      \
                uint8_t             *pDst10 = (uint8_t*)pDst + srcHeight *      \
                                                                 iTargetStride; \
                uint8_t             *pDst11 = (uint8_t*)pDst10 + srcWidth;      \
                                                                                \
                /* 16-bit offsets, starting from the last pixel of the row */   \
                uint32_t            curOffsetIdx = srcWidth - 1;                \
                uint16x8_t          offset = vddupq_wb_u16(&curOffsetIdx, 1);   \
                                                                                \
                /* duplicate current source line into 2x2 destinations */       \
                do {                                                            \
                    mve_pred16_t    p = vctp16q(dstColCnt);                     \
                    /* 16-bit lanes, as produced by the widening gather */      \
                    uint16x8_t      in;                                         \
                    in = vldrbq_gather_offset_z_u16(pSource, offset, p);        \
                    offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p);              \
                                                                                \
                    /* placeholder for color masking */                         \
                    __VA_ARGS__;                                                \
                                                                                \
                    vstrbq_p_u16(pDst00, in, p);                                \
                    vstrbq_p_u16(pDst01, in, p);                                \
                    vstrbq_p_u16(pDst10, in, p);                                \
                    vstrbq_p_u16(pDst11, in, p);                                \
                                                                                \
                    pDst00 += 8;                                                \
                    pDst01 += 8;                                                \
                    pDst10 += 8;                                                \
                    pDst11 += 8;                                                \
                    dstColCnt -= 8;                                             \
                }                                                               \
                while (dstColCnt > 0);                                          \
                                                                                \
                pDst += 2 * srcWidth;                                           \
                tilePairColsCnt--;                                              \
            }                                                                   \
            while (tilePairColsCnt != 0);                                       \
                                                                                \
            rowCnt ++;                                                          \
            pTargetBaseCur += iTargetStride;                                    \
            pSourceBaseCur += (iSourceStride * DIR);                            \
        }                                                                       \
        while (rowCnt < ptSourceSize->iHeight);                                 \
                                                                                \
        /* skip the rows already filled through pDst10 / pDst11 */              \
        pTargetBaseCur += srcHeight * iTargetStride;                            \
        tilePairRows--;                                                         \
    }                                                                           \
    while (tilePairRows != 0);                                                  \
}


/*
 * Helium template paving the target with 2x2 groups of the source tile.
 *
 * sz           : pixel size in bits; selects the scalar / vector types, the
 *                tail predicate (vctp<sz>q) and the per-iteration step
 * SRC_OFFSET   : element offset added to the current source row pointer
 * DIR          : multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR : macro invoked as SETUP_MIRROR(srcWidth) before each row copy
 * LOAD         : macro invoked as `in = LOAD(pSource, offset)`; IS_##LOAD is
 *                also evaluated to detect the mirrored load patterns
 * ...          : optional statement executed between the load and the stores
 *                (colour-masking placeholder)
 *
 * Every source row is loaded once per tile pair and stored, under tail
 * predication, to four places: the row itself, the same row srcWidth
 * elements further right, and both again srcHeight target rows below.
 *
 * 8-bit tiles with srcWidth >= 256 and a load pattern for which IS_##LOAD is
 * non-zero are handled by __ARM_2D_PAVING_2x2_8BIT_X_MIRROR_FIXUP instead.
 *
 * Names taken from the expansion site: ptSourceSize, pSourceBase,
 * pTargetBase, iSourceStride, iTargetStride, tilePairCols and tilePairRows
 * (tilePairRows is decremented down to 0).
 * All loops are do/while and therefore run at least once; callers are
 * presumably expected to pass non-zero sizes and counts - TODO confirm.
 */
#define __ARM_2D_PAVING_2x2(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...)       \
{                                                                               \
    uint32_t            srcWidth = ptSourceSize->iWidth;                        \
    uint32_t            srcHeight = ptSourceSize->iHeight;                      \
    const ARM_PIX_SCLTYP(sz) *pSourceBaseCur;                                   \
    ARM_PIX_SCLTYP(sz) *pTargetBaseCur = pTargetBase;                           \
                                                                                \
    if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) {                          \
        /* special case for 8-bit and X & XY mirror  */                         \
        /* width does not fit in 8-bit, need widening */                        \
        /* will be optimized away for all other cases */                        \
        /* not executed unconditionally as slower */                            \
        __ARM_2D_PAVING_2x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR,                \
                            SETUP_MIRROR, LOAD, __VA_ARGS__)                    \
    } else                                                                      \
    /* row iteration */                                                         \
    /* handle pair of source image row and fill in the column direction */      \
    do {                                                                        \
        uint32_t        rowCnt = 0;                                             \
                                                                                \
        pSourceBaseCur = pSourceBase;                                           \
                                                                                \
        /* single source row loop */                                            \
        do {                                                                    \
            ARM_PIX_SCLTYP(sz) *pDst = pTargetBaseCur;                          \
            uint32_t            tilePairColsCnt = tilePairCols;                 \
                                                                                \
            /* column loop */                                                   \
            /* duplicate current source row into 2 x 2 destination across */    \
            /*destination columns */                                            \
            /*                                                                  \
             *  +-------+                                                       \
             *  |xxxxxxx|                                                       \
             *  |  src  |                                                       \
             *  +-------+                                                       \
             *                                                                  \
             *            <--------------->                                     \
             *                                                                  \
             *           ||       |       ||       |       ||                   \
             *       *   +========+=======+========+=======+                    \
             *       |   ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||...                \
             *       |   ||       |       ||       |       ||...                \
             *       |   +--------+-------+--------+-------+-...                \
             *       |   ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||...                \
             *       |   ||       |       ||       |       ||...                \
             *       *   +========+=======+========+=======+                    \
             *           ||       |       ||       |       ||                   \
             */                                                                 \
            do {                                                                \
                int32_t              dstColCnt = srcWidth;                      \
                const ARM_PIX_SCLTYP(sz)  *pSource = pSourceBaseCur             \
                                                              + SRC_OFFSET;     \
                ARM_PIX_SCLTYP(sz)  *pDst00 = pDst;                             \
                ARM_PIX_SCLTYP(sz)  *pDst01 = pDst00 + srcWidth;                \
                ARM_PIX_SCLTYP(sz)  *pDst10 = pDst + srcHeight * iTargetStride; \
                ARM_PIX_SCLTYP(sz)  *pDst11 = pDst10 + srcWidth;                \
                                                                                \
                SETUP_MIRROR(srcWidth);                                         \
                                                                                \
                /* duplicate current source line into 2x2 destinations */       \
                do {                                                            \
                    mve_pred16_t        p =                                     \
                        ARM_CONNECT2(ARM_CONNECT2(vctp, sz),q)(dstColCnt);      \
                    ARM_PIX_VECTYP(sz)  in;                                     \
                                                                                \
                    in = LOAD(pSource, offset);                                 \
                    /* placeholder for color masking */                         \
                    __VA_ARGS__;                                                \
                                                                                \
                    vst1q_p(pDst00, in, p);                                     \
                    vst1q_p(pDst01, in, p);                                     \
                    vst1q_p(pDst10, in, p);                                     \
                    vst1q_p(pDst11, in, p);                                     \
                                                                                \
                    pDst00 += ARM_PIX_VECELT(sz);                               \
                    pDst01 += ARM_PIX_VECELT(sz);                               \
                    pDst10 += ARM_PIX_VECELT(sz);                               \
                    pDst11 += ARM_PIX_VECELT(sz);                               \
                    dstColCnt -= ARM_PIX_VECELT(sz);                            \
                }                                                               \
                while (dstColCnt > 0);                                          \
                                                                                \
                pDst += 2 * srcWidth;                                           \
                tilePairColsCnt--;                                              \
            }                                                                   \
            while (tilePairColsCnt != 0);                                       \
                                                                                \
            rowCnt ++;                                                          \
            pTargetBaseCur += iTargetStride;                                    \
            pSourceBaseCur += (iSourceStride * DIR);                            \
        }                                                                       \
        while (rowCnt < ptSourceSize->iHeight);                                 \
                                                                                \
        pTargetBaseCur += srcHeight * iTargetStride;                            \
        tilePairRows--;                                                         \
    }                                                                           \
    while (tilePairRows != 0);                                                  \
}


/*
 * 1x2 paving of an 8-bit tile with X or XY mirroring when the tile is too
 * wide for 8-bit gather offsets (selected by __ARM_2D_PAVING_1x2 when
 * srcWidth >= 256).
 *
 * The source row is gathered with 16-bit offsets, 8 pixels per iteration,
 * each pixel being widened into a 16-bit lane, and written back with
 * narrowing byte stores. This allows tile widths up to 65K.
 *
 * SRC_OFFSET   : element offset added to the current source row pointer
 * DIR          : multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR : not used here, kept for parity with __ARM_2D_PAVING_1x2
 * LOAD         : not used here, kept for parity with __ARM_2D_PAVING_1x2
 * ...          : optional statement executed between the load and the stores
 *                (colour-masking placeholder); it may read `in` and update
 *                the predicate `p`
 *
 * Names taken from the expansion site: pSourceBase, pSourceBaseCur,
 * pTargetBaseCur, srcWidth, srcHeight, destWidth, iSourceStride,
 * iTargetStride, tilePairRows (decremented down to 0) and ptSourceSize.
 */
#define __ARM_2D_PAVING_1x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR,                \
                                                SETUP_MIRROR, LOAD, ...)        \
{                                                                               \
    /* row iteration */                                                         \
    /* handle pair of source image row and fill in the column direction */      \
    do {                                                                        \
        uint32_t        rowCnt = 0;                                             \
                                                                                \
        pSourceBaseCur = pSourceBase;                                           \
                                                                                \
        /* single source row loop */                                            \
        do {                                                                    \
            int32_t              dstColCnt = destWidth;                         \
            const uint8_t       *pSource = (const uint8_t*)pSourceBaseCur       \
                                                              + SRC_OFFSET;     \
            uint8_t            *pDst = (uint8_t*)pTargetBaseCur;                \
            uint8_t            *pDst00 = pDst;                                  \
            uint8_t            *pDst10 = pDst + srcHeight * iTargetStride;      \
                                                                                \
            /* column loop */                                                   \
            /* duplicate current source row into 1 x 2 destination across */    \
            /* destination columns */                                           \
            /*                                                                  \
             *  +-------+                                                       \
             *  |xxxxxxx|                                                       \
             *  |  src  |                                                       \
             *  +-------+                                                       \
             *                                                                  \
             *           ..||       ||                                          \
             *       *   ==+========+=...                                       \
             *       |   ..||xxxxxxx||...                                       \
             *       |   ..||       ||...                                       \
             *       |   ..+--------+-...                                       \
             *       |   ..||xxxxxxx||...                                       \
             *       |   ..||       ||...                                       \
             *       *   ==+========+=...                                       \
             *           ..||       ||...                                       \
             */                                                                 \
            /* 16-bit offsets, starting from the last pixel of the row */       \
            uint32_t            curOffsetIdx = srcWidth - 1;                    \
            uint16x8_t          offset = vddupq_wb_u16(&curOffsetIdx, 1);       \
                                                                                \
            /* duplicate current source line into 1x2 destinations */           \
            do {                                                                \
                mve_pred16_t    p = vctp16q(dstColCnt);                         \
                /* 16-bit lanes, as produced by the widening gather */          \
                uint16x8_t      in;                                             \
                in = vldrbq_gather_offset_z_u16(pSource, offset, p);            \
                offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p);                  \
                                                                                \
                /* placeholder for color masking */                             \
                __VA_ARGS__;                                                    \
                                                                                \
                vstrbq_p_u16(pDst00, in, p);                                    \
                vstrbq_p_u16(pDst10, in, p);                                    \
                                                                                \
                pDst00 += 8;                                                    \
                pDst10 += 8;                                                    \
                dstColCnt -= 8;                                                 \
            }                                                                   \
            while (dstColCnt > 0);                                              \
                                                                                \
            rowCnt ++;                                                          \
            pTargetBaseCur += iTargetStride;                                    \
            pSourceBaseCur += (iSourceStride * DIR);                            \
        }                                                                       \
        while (rowCnt < ptSourceSize->iHeight);                                 \
                                                                                \
        /* skip the rows already filled through pDst10 */                       \
        pTargetBaseCur += srcHeight * iTargetStride;                            \
        tilePairRows--;                                                         \
    }                                                                           \
    while (tilePairRows != 0);                                                  \
}


/*
 * Helium template paving the target with 1x2 groups of the source tile
 * (one tile column, two tile rows).
 *
 * sz           : pixel size in bits; selects the scalar / vector types, the
 *                tail predicate (vctp<sz>q) and the per-iteration step
 * SRC_OFFSET   : element offset added to the current source row pointer
 * DIR          : multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR : macro invoked as SETUP_MIRROR(srcWidth) before each row copy
 * LOAD         : macro invoked as `in = LOAD(pSource, offset)`; IS_##LOAD is
 *                also evaluated to detect the mirrored load patterns
 * ...          : optional statement executed between the load and the stores
 *                (colour-masking placeholder)
 *
 * For every source row, destWidth elements are loaded and stored, under tail
 * predication, to the current target row and to the row located srcHeight
 * target rows below.
 *
 * 8-bit tiles with srcWidth >= 256 and a load pattern for which IS_##LOAD is
 * non-zero are handled by __ARM_2D_PAVING_1x2_8BIT_X_MIRROR_FIXUP instead.
 *
 * Names taken from the expansion site: ptSourceSize, pSourceBase,
 * pTargetBase, destWidth, iSourceStride, iTargetStride and tilePairRows
 * (decremented down to 0).
 * All loops are do/while and therefore run at least once; callers are
 * presumably expected to pass non-zero sizes and counts - TODO confirm.
 */
#define __ARM_2D_PAVING_1x2(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...)       \
{                                                                               \
    uint32_t            srcWidth = ptSourceSize->iWidth;                        \
    uint32_t            srcHeight = ptSourceSize->iHeight;                      \
    const ARM_PIX_SCLTYP(sz) *pSourceBaseCur;                                   \
    ARM_PIX_SCLTYP(sz) *pTargetBaseCur = pTargetBase;                           \
                                                                                \
    if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) {                          \
        /* special case for 8-bit and X & XY mirror  */                         \
        /* width does not fit in 8-bit, need widening */                        \
        /* will be optimized away for all other cases */                        \
        /* not executed unconditionally as slower */                            \
        __ARM_2D_PAVING_1x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR,                \
                            SETUP_MIRROR, LOAD, __VA_ARGS__)                    \
    } else                                                                      \
    /* row iteration */                                                         \
    /* handle pair of source image row and fill in the column direction */      \
    do {                                                                        \
        uint32_t        rowCnt = 0;                                             \
                                                                                \
        pSourceBaseCur = pSourceBase;                                           \
                                                                                \
        /* single source row loop */                                            \
        do {                                                                    \
            int32_t              dstColCnt = destWidth;                         \
            const ARM_PIX_SCLTYP(sz)  *pSource = pSourceBaseCur + SRC_OFFSET;   \
            ARM_PIX_SCLTYP(sz)  *pDst = pTargetBaseCur;                         \
            ARM_PIX_SCLTYP(sz)  *pDst00 = pDst;                                 \
            ARM_PIX_SCLTYP(sz)  *pDst10 = pDst + srcHeight * iTargetStride;     \
                                                                                \
            /* column loop */                                                   \
            /* duplicate current source row into 1 x 2 destination across */    \
            /* destination columns */                                           \
            /*                                                                  \
             *  +-------+                                                       \
             *  |xxxxxxx|                                                       \
             *  |  src  |                                                       \
             *  +-------+                                                       \
             *                                                                  \
             *           ..||       ||                                          \
             *       *   ==+========+=...                                       \
             *       |   ..||xxxxxxx||...                                       \
             *       |   ..||       ||...                                       \
             *       |   ..+--------+-...                                       \
             *       |   ..||xxxxxxx||...                                       \
             *       |   ..||       ||...                                       \
             *       *   ==+========+=...                                       \
             *           ..||       ||...                                       \
             */                                                                 \
            SETUP_MIRROR(srcWidth);                                             \
                                                                                \
            /* duplicate current source line into 1x2 destinations */           \
            do {                                                                \
                mve_pred16_t        p =                                         \
                    ARM_CONNECT2(ARM_CONNECT2(vctp, sz),q)(dstColCnt);          \
                ARM_PIX_VECTYP(sz)  in;                                         \
                                                                                \
                in = LOAD(pSource, offset);                                     \
                /* placeholder for color masking */                             \
                __VA_ARGS__;                                                    \
                                                                                \
                vst1q_p(pDst00, in, p);                                         \
                vst1q_p(pDst10, in, p);                                         \
                                                                                \
                pDst00 += ARM_PIX_VECELT(sz);                                   \
                pDst10 += ARM_PIX_VECELT(sz);                                   \
                dstColCnt -= ARM_PIX_VECELT(sz);                                \
            }                                                                   \
            while (dstColCnt > 0);                                              \
                                                                                \
            rowCnt ++;                                                          \
            pTargetBaseCur += iTargetStride;                                    \
            pSourceBaseCur += (iSourceStride * DIR);                            \
        }                                                                       \
        while (rowCnt < ptSourceSize->iHeight);                                 \
                                                                                \
        pTargetBaseCur += srcHeight * iTargetStride;                            \
        tilePairRows--;                                                         \
    }                                                                           \
    while (tilePairRows != 0);                                                  \
}

/*
 * 2x1 paving fallback for 8-bit pixels with X (or XY) mirroring on wide tiles.
 *
 * __ARM_2D_PAVING_2x1 expands this macro when sz == 8, srcWidth >= 256 and
 * IS_<LOAD> is true. Column indices are generated as 16-bit offsets counting
 * down from srcWidth - 1, so tile widths up to 65K can be addressed. The
 * price is that only 8 pixels are handled per iteration: bytes are gathered
 * into 16-bit lanes (vldrbq_gather_offset_z_u16) and narrowed back to bytes
 * on store (vstrbq_p_u16).
 *
 * For every destination row (until rowCnt reaches destHeight) the mirrored
 * source row is written twice per tile pair, at pDst and pDst + srcWidth,
 * for tilePairCols pairs.
 *
 *  SRC_OFFSET    byte offset added to the start of each source row
 *  DIR           multiplier applied to iSourceStride when stepping to the
 *                next source row (presumably +1 / -1 - confirm with callers)
 *  SETUP_MIRROR  accepted for signature parity, not referenced in the body
 *  LOAD          accepted for signature parity, not referenced in the body
 *  ...           statement(s) pasted right after the load; they may read `in`
 *                and update the predicate `p` (e.g. colour masking)
 *
 * Names taken from the expansion site: pSourceBaseCur, pSourceBase,
 * pTargetBaseCur, tilePairCols, srcWidth, rowCnt, iTargetStride,
 * iSourceStride, destHeight.
 *
 * The offset vector is refreshed under `p` BEFORE the pasted statements run,
 * because those statements are allowed to clear lanes of `p`.
 *
 * NOTE(review): `in` is declared uint8x16_t while the load/store intrinsics
 * work on uint16x8_t; this relies on implicit vector conversion - confirm it
 * is intentional before changing it.
 */
#define __ARM_2D_PAVING_2x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR,                \
                                                 SETUP_MIRROR, LOAD, ...)       \
{                                                                               \
                                                                                \
    /* row iteration */                                                         \
    /* handle pair of source image row and fill in the column direction */      \
    /*                                                                          \
     *  +-------+                                                               \
     *  | src   |                                                               \
     *  +-------+                                                               \
     *                                                                          \
     *                             tilePairCols                                 \
     *                  <---------------+--------........>                      \
     *                                                                          \
     *                  +=======+=======++=======+=======++......               \
     *                  ||      |       ||       |       ||                     \
     *                  +=======+=======++=======+=======++                     \
     *                  ........                                                \
     */                                                                         \
                                                                                \
    pSourceBaseCur = pSourceBase;                                               \
                                                                                \
    /* copy each source row into every 2 x 1 tile pair */                       \
    do {                                                                        \
        uint8_t            *pDst = (uint8_t*)pTargetBaseCur;                    \
        int16_t   tilePairColsCnt = tilePairCols;                               \
                                                                                \
        do {                                                                    \
            int                 srcColCnt = srcWidth;                           \
            const uint8_t       *pSource = (const uint8_t*)pSourceBaseCur       \
                                                              + SRC_OFFSET;     \
            uint8_t *pDst00 = pDst;                                             \
            uint8_t *pDst01 = pDst00 + srcColCnt;                               \
                                                                                \
             uint32_t            curOffsetIdx = srcWidth - 1;                   \
            uint16x8_t          offset = vddupq_wb_u16(&curOffsetIdx, 1);       \
                                                                                \
            /* write the mirrored row segment to both tiles of the pair */      \
            do {                                                                \
                mve_pred16_t    p = vctp16q(srcColCnt);                         \
                uint8x16_t      in;                                             \
                in = vldrbq_gather_offset_z_u16(pSource, offset, p);            \
                offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p);                  \
                /* placeholder for color masking */                             \
                __VA_ARGS__;                                                    \
                                                                                \
                vstrbq_p_u16(pDst00, in, p);                                    \
                vstrbq_p_u16(pDst01, in, p);                                    \
                                                                                \
                pDst00 +=  8;                                                   \
                pDst01 +=  8;                                                   \
                srcColCnt -=  8;                                                \
            }                                                                   \
            while ((int32_t) srcColCnt > 0);                                    \
                                                                                \
            pDst += 2 * srcWidth;                                               \
            tilePairColsCnt--;                                                  \
        }                                                                       \
        while (tilePairColsCnt != 0);                                           \
                                                                                \
        rowCnt++;                                                               \
        pTargetBaseCur += iTargetStride;                                        \
        pSourceBaseCur += (iSourceStride * DIR);                                \
    }                                                                           \
    while (rowCnt < destHeight);                                                \
}


/*
 * Pave a band of horizontally adjacent tile pairs, one source row at a time
 * (2 x 1 pattern).
 *
 * For each destination row (until rowCnt reaches destHeight) the current
 * source row is fetched with LOAD() and stored twice per tile pair - at pDst
 * and at pDst + srcWidth - for tilePairCols consecutive pairs.
 *
 *  sz            pixel width in bits; selects the scalar / vector types and
 *                the tail-predicate intrinsic (vctp<sz>q)
 *  SRC_OFFSET    element offset added to the start of each source row
 *  DIR           multiplier applied to iSourceStride when stepping to the
 *                next source row (presumably +1 / -1 - confirm with callers)
 *  SETUP_MIRROR  macro invoked as SETUP_MIRROR(srcWidth) before each row copy
 *                (presumably declares the `offset` consumed by LOAD - confirm)
 *  LOAD          macro invoked as LOAD(pSource, offset) to produce `in`;
 *                IS_<LOAD> is additionally used as a condition selecting the
 *                8-bit mirror fallback
 *  ...           statement(s) pasted right after the load; they may read `in`
 *                and update the predicate `p` (e.g. colour masking)
 *
 * Names taken from the expansion site: ptSourceSize, pSourceBase,
 * pTargetBase, tilePairCols, destHeight, iSourceStride, iTargetStride.
 *
 * The tile-pair counter is a fixed 32-bit integer on purpose: typing it as
 * the pixel scalar type would truncate tilePairCols for narrow pixel formats
 * (e.g. more than 255 pairs with 8-bit pixels).
 */
#define __ARM_2D_PAVING_2x1(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...)       \
{                                                                               \
    uint32_t             srcWidth = ptSourceSize->iWidth;                       \
    const ARM_PIX_SCLTYP(sz)  *pSourceBaseCur;                                  \
    ARM_PIX_SCLTYP(sz)  *pTargetBaseCur = pTargetBase;                          \
    uint32_t             rowCnt = 0;                                            \
                                                                                \
    if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) {                          \
        /* special case for 8-bit and X & XY mirror  */                         \
        /* width does not fit in 8-bit, need widening */                        \
        /* will be optimized away for all other cases */                        \
        /* not executed unconditionally as slower */                            \
        __ARM_2D_PAVING_2x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR,                \
                            SETUP_MIRROR, LOAD, __VA_ARGS__)                    \
    } else {                                                                    \
    /* row iteration */                                                         \
    /* handle pair of source image row and fill in the column direction */      \
    /*                                                                          \
     *  +-------+                                                               \
     *  | src   |                                                               \
     *  +-------+                                                               \
     *                                                                          \
     *                             tilePairCols                                 \
     *                  <---------------+--------........>                      \
     *                                                                          \
     *                  +=======+=======++=======+=======++......               \
     *                  ||      |       ||       |       ||                     \
     *                  +=======+=======++=======+=======++                     \
     *                  ........                                                \
     */                                                                         \
                                                                                \
    pSourceBaseCur = pSourceBase;                                               \
                                                                                \
    /* copy each source row into every 2 x 1 tile pair */                       \
    do {                                                                        \
        ARM_PIX_SCLTYP(sz)  *pDst = pTargetBaseCur;                             \
        /* plain counter: must not inherit the pixel type's range */            \
        uint32_t             tilePairColsCnt = tilePairCols;                    \
                                                                                \
        do {                                                                    \
            int                 srcColCnt = srcWidth;                           \
            const ARM_PIX_SCLTYP(sz) *pSource = pSourceBaseCur + SRC_OFFSET;    \
            ARM_PIX_SCLTYP(sz) *pDst00 = pDst;                                  \
            ARM_PIX_SCLTYP(sz) *pDst01 = pDst00 + srcColCnt;                    \
                                                                                \
            SETUP_MIRROR(srcWidth);                                             \
                                                                                \
            /* write the loaded row segment to both tiles of the pair */        \
            do {                                                                \
                /* tail predicate: only the remaining columns are active */     \
                mve_pred16_t        p =                                         \
                    ARM_CONNECT2(ARM_CONNECT2(vctp, sz),q)(srcColCnt);          \
                ARM_PIX_VECTYP(sz)  in;                                         \
                                                                                \
                in = LOAD(pSource, offset);                                     \
                /* placeholder for color masking */                             \
                __VA_ARGS__;                                                    \
                                                                                \
                vst1q_p(pDst00, in, p);                                         \
                vst1q_p(pDst01, in, p);                                         \
                                                                                \
                pDst00 += ARM_PIX_VECELT(sz);                                   \
                pDst01 += ARM_PIX_VECELT(sz);                                   \
                srcColCnt -= ARM_PIX_VECELT(sz);                                \
            }                                                                   \
            while ((int32_t) srcColCnt > 0);                                    \
                                                                                \
            /* advance to the next tile pair on the same row */                 \
            pDst += 2 * srcWidth;                                               \
            tilePairColsCnt--;                                                  \
        }                                                                       \
        while (tilePairColsCnt != 0);                                           \
                                                                                \
        rowCnt++;                                                               \
        pTargetBaseCur += iTargetStride;                                        \
        pSourceBaseCur += (iSourceStride * DIR);                                \
    }                                                                           \
    while (rowCnt < destHeight);                                                \
    }                                                                           \
}


/*
 * 1x1 copy fallback for 8-bit pixels with X (or XY) mirroring on wide tiles.
 *
 * __ARM_2D_PAVING_1x1 expands this macro when sz == 8, srcWidth >= 256 and
 * IS_<LOAD> is true. Column indices are generated as 16-bit offsets counting
 * down from srcWidth - 1, so tile widths up to 65K can be addressed. The
 * price is that only 8 pixels are handled per iteration: bytes are gathered
 * into 16-bit lanes (vldrbq_gather_offset_z_u16) and narrowed back to bytes
 * on store (vstrbq_p_u16).
 *
 * ptDstCopySize->iHeight rows are produced; on each row
 * ptDstCopySize->iWidth pixels are written, read from the source row in
 * descending column order.
 *
 *  SRC_OFFSET    offset added once to pSource before the first row
 *  DIR           multiplier applied to iSourceStride when stepping to the
 *                next source row (presumably +1 / -1 - confirm with callers)
 *  SETUP_MIRROR  accepted for signature parity, not referenced in the body
 *  LOAD          accepted for signature parity, not referenced in the body
 *  ...           statement(s) pasted right after the load; they may read `in`
 *                and update the predicate `p` (e.g. colour masking)
 *
 * Names taken from the expansion site: pSource, pTarget, ptDstCopySize,
 * srcWidth, iSourceStride, iTargetStride. pSource and pTarget are advanced
 * in place by this macro.
 *
 * The offset vector is refreshed under `p` BEFORE the pasted statements run,
 * because those statements are allowed to clear lanes of `p`.
 *
 * NOTE(review): `in` is declared uint8x16_t while the load/store intrinsics
 * work on uint16x8_t; this relies on implicit vector conversion - confirm it
 * is intentional before changing it.
 */
#define __ARM_2D_PAVING_1x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR,                \
                                                 SETUP_MIRROR, LOAD, ...)       \
{                                                                               \
    pSource += SRC_OFFSET;                                                      \
                                                                                \
    for (int_fast16_t y = 0; y < ptDstCopySize->iHeight; y++) {                 \
        uint8_t         *pDst = (uint8_t  *)pTarget;                            \
        const uint8_t   *pSrc = (const uint8_t*)pSource;                        \
        uint32_t         dstWidth = ptDstCopySize->iWidth;                      \
        uint32_t         curOffsetIdx = srcWidth - 1;                           \
        uint16x8_t       offset = vddupq_wb_u16(&curOffsetIdx, 1);              \
                                                                                \
        do {                                                                    \
            mve_pred16_t    p = vctp16q(dstWidth);                              \
            uint8x16_t      in;                                                 \
            in = vldrbq_gather_offset_z_u16(pSrc, offset, p);                   \
            offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p);                      \
            /* placeholder for color masking */                                 \
            __VA_ARGS__;                                                        \
                                                                                \
            vstrbq_p_u16(pDst, in, p);                                          \
                                                                                \
            pDst += 8;                                                          \
            dstWidth -= 8;                                                      \
        }                                                                       \
        while ((int32_t) dstWidth > 0);                                         \
                                                                                \
        pSource += (iSourceStride * DIR);                                       \
        pTarget += iTargetStride;                                               \
    }                                                                           \
}


/*
 * Copy a single (possibly partial) tile row by row (1 x 1 pattern).
 *
 * ptDstCopySize->iHeight rows are produced; on each row
 * ptDstCopySize->iWidth pixels are fetched with LOAD() and stored to the
 * target under a tail predicate.
 *
 *  sz            pixel width in bits; selects the scalar / vector types and
 *                the tail-predicate intrinsic (vctp<sz>q)
 *  SRC_OFFSET    offset added once to pSource before the first row
 *  DIR           multiplier applied to iSourceStride when stepping to the
 *                next source row (presumably +1 / -1 - confirm with callers)
 *  SETUP_MIRROR  macro invoked as SETUP_MIRROR(srcWidth) before each row copy
 *                (presumably declares the `offset` consumed by LOAD - confirm)
 *  LOAD          macro invoked as LOAD(pSrc, offset) to produce `in`;
 *                IS_<LOAD> is additionally used as a condition selecting the
 *                8-bit mirror fallback
 *  ...           statement(s) pasted right after the load; they may read `in`
 *                and update the predicate `p` (e.g. colour masking)
 *
 * Names taken from the expansion site: ptSrcCopySize, ptDstCopySize, pSource,
 * pTarget, iSourceStride, iTargetStride. pSource and pTarget are advanced in
 * place by this macro.
 *
 * The per-row source cursor is pointer-to-const, like the source pointers of
 * the other paving templates: the source is only ever read here.
 */
#define __ARM_2D_PAVING_1x1(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...)       \
{                                                                               \
    uint32_t             srcWidth = ptSrcCopySize->iWidth;                      \
    if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) {                          \
        /* special case for 8-bit and X & XY mirror  */                         \
        /* width does not fit in 8-bit, need widening */                        \
        /* will be optimized away for all other cases */                        \
        /* not executed unconditionally as slower */                            \
        __ARM_2D_PAVING_1x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR,                \
                            SETUP_MIRROR, LOAD, __VA_ARGS__)                    \
    } else {                                                                    \
        pSource += SRC_OFFSET;                                                  \
                                                                                \
        for (int_fast16_t y = 0; y < ptDstCopySize->iHeight; y++) {             \
            ARM_PIX_SCLTYP(sz)        *pDst = pTarget;                          \
            /* read-only cursor over the current source row */                  \
            const ARM_PIX_SCLTYP(sz)  *pSrc = pSource;                          \
            uint32_t                   dstWidth = ptDstCopySize->iWidth;        \
                                                                                \
            SETUP_MIRROR(srcWidth);                                             \
                                                                                \
            do {                                                                \
                /* tail predicate: only the remaining columns are active */     \
                mve_pred16_t        p =                                         \
                    ARM_CONNECT2(ARM_CONNECT2(vctp, sz), q) (dstWidth);         \
                ARM_PIX_VECTYP(sz)  in;                                         \
                                                                                \
                in = LOAD(pSrc, offset);                                        \
                /* placeholder for color masking */                             \
                __VA_ARGS__;                                                    \
                                                                                \
                vst1q_p(pDst, in, p);                                           \
                                                                                \
                pDst += ARM_PIX_VECELT(sz);                                     \
                dstWidth -= ARM_PIX_VECELT(sz);                                 \
            }                                                                   \
            while ((int32_t) dstWidth > 0);                                     \
                                                                                \
            pSource += (iSourceStride * DIR);                                   \
            pTarget += iTargetStride;                                           \
        }                                                                       \
    }                                                                           \
}

/*============================ TYPES =========================================*/
/*============================ GLOBAL VARIABLES ==============================*/
/*============================ PROTOTYPES ====================================*/

#ifdef   __cplusplus
}
#endif

#endif