pikapython/package/Arm2D/__arm_2d_paving_helium.h
2021-11-09 22:19:51 +08:00

705 lines
49 KiB
C

/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: Arm-2D Library
* Title: arm-2d_paving_helium.h
* Description: Provides definitions and code templates for generic paving
*
* $Date: 17. Sep 2021
* $Revision: V 0.6.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#ifndef __ARM_2D_PAVING_HELIUM_H__
#define __ARM_2D_PAVING_HELIUM_H__
/*============================ INCLUDES ======================================*/
#include "arm_2d.h"
#include "__arm_2d_paving.h"
#ifdef __cplusplus
extern "C" {
#endif
/*============================ MACROS ========================================*/
/*============================ MACROFIED FUNCTIONS ===========================*/
#undef LOAD_SRC_DIRECT_16
#undef LOAD_SRC_DIRECT_32
#undef LOAD_SRC_X_MIRROR_16
#undef LOAD_SRC_X_MIRROR_32
#undef SETUP_MIRROR_COPY_16
#undef SETUP_MIRROR_COPY_32
#undef __ARM_2D_PAVING_2x2
#undef __ARM_2D_PAVING_1x2
#undef __ARM_2D_PAVING_2x1
#undef __ARM_2D_PAVING_1x1
/* Set predicate to true if vector different than color mask */
#define CMP_CL_MSK(sz, colour) p = vcmpneq_m(in, colour, p)
#undef LOAD_SRC_DIRECT_8
#undef LOAD_SRC_DIRECT_16
#undef LOAD_SRC_DIRECT_32
#undef LOAD_SRC_X_MIRROR_8
#undef LOAD_SRC_X_MIRROR_16
#undef LOAD_SRC_X_MIRROR_32
#undef SETUP_MIRROR_COPY_8
#undef SETUP_MIRROR_COPY_16
#undef SETUP_MIRROR_COPY_32
#define LOAD_SRC_DIRECT_8(pSource, offset) \
vld1q_z(pSource, p); \
pSource += 16;
#define LOAD_SRC_DIRECT_16(pSource, offset) \
vld1q_z(pSource, p); \
pSource += 8;
#define LOAD_SRC_DIRECT_32(pSource, offset) \
vld1q_z(pSource, p); \
pSource += 4;
#define LOAD_SRC_X_MIRROR_8(pSource, offset) \
vldrbq_gather_offset_z(pSource, offset, p); \
offset = vddupq_x_wb_u8(&curOffsetIdx, 1, p);
#define LOAD_SRC_X_MIRROR_16(pSource, offset) \
vldrhq_gather_shifted_offset_z(pSource, offset, p); \
offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p);
#define LOAD_SRC_X_MIRROR_32(pSource, offset) \
vldrwq_gather_shifted_offset_z(pSource, offset, p); \
offset = vddupq_x_wb_u32(&curOffsetIdx, 1, p);
/* prepare Helium gather load offset */
#define SETUP_MIRROR_COPY_8(srcWidth) \
uint32_t curOffsetIdx = srcWidth - 1; \
uint8x16_t offset = vddupq_wb_u8(&curOffsetIdx, 1);
#define SETUP_MIRROR_COPY_16(srcWidth) \
uint32_t curOffsetIdx = srcWidth - 1; \
uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1);
#define SETUP_MIRROR_COPY_32(srcWidth) \
uint32_t curOffsetIdx = srcWidth - 1; \
uint32x4_t offset = vddupq_wb_u32(&curOffsetIdx, 1);
/* Macros intercepting X or XY mirroring */
#define IS_PAVING_DIRECT_LOAD_PATTERN(sz) (0)
#define IS_PAVING_X_MIRROR_LOAD_PATTERN(sz) ((sz == 8)? 1: 0)
#define IS_PAVING_Y_MIRROR_LOAD_PATTERN(sz) (0)
#define IS_PAVING_XY_MIRROR_LOAD_PATTERN(sz) ((sz == 8)? 1: 0)
/* handle c8bit X or XY 2x2 mirroring when tile width > 256 */
/* uses 8-bit widened load allowing width up to 65K */
#define __ARM_2D_PAVING_2x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
SETUP_MIRROR, LOAD, ...) \
{ \
/* row iteration */ \
/* handle pair of source image row and fill in the column direction */ \
do { \
uint32_t rowCnt = 0; \
\
pSourceBaseCur = pSourceBase; \
\
/* single source row loop */ \
do { \
uint8_t *pDst = (uint8_t*)pTargetBaseCur; \
uint32_t tilePairColsCnt = tilePairCols; \
\
/* column loop */ \
/* duplicate current source row into 2 x 2 destination across */ \
/*destination columns */ \
/* \
* +-------+ \
* |xxxxxxx| \
* | src | \
* +-------+ \
* \
* <---------------> \
* \
* || | || | || \
* * +========+=======+========+=======+ \
* | ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||... \
* | || | || | ||... \
* | +--------+-------+--------+-------+-... \
* | ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||... \
* | || | || | ||... \
* * +========+=======+========+=======+ \
* || | || | || \
*/ \
do { \
int32_t dstColCnt = srcWidth; \
const uint8_t *pSource = (const uint8_t*)pSourceBaseCur \
+ SRC_OFFSET; \
uint8_t *pDst00 = (uint8_t*)pDst; \
uint8_t *pDst01 = (uint8_t*)pDst00 + srcWidth; \
uint8_t *pDst10 = (uint8_t*)pDst + srcHeight * \
iTargetStride; \
uint8_t *pDst11 = (uint8_t*)pDst10 + srcWidth; \
\
uint32_t curOffsetIdx = srcWidth - 1; \
uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1); \
\
/* duplicate current source line into 2x2 destinations */ \
do { \
mve_pred16_t p = vctp16q(dstColCnt); \
uint8x16_t in; \
in = vldrbq_gather_offset_z_u16(pSource, offset, p); \
offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p); \
\
/* placeholder for color masking */ \
__VA_ARGS__; \
\
vstrbq_p_u16(pDst00, in, p); \
vstrbq_p_u16(pDst01, in, p); \
vstrbq_p_u16(pDst10, in, p); \
vstrbq_p_u16(pDst11, in, p); \
\
pDst00 += 8; \
pDst01 += 8; \
pDst10 += 8; \
pDst11 += 8; \
dstColCnt -= 8; \
} \
while (dstColCnt > 0); \
\
pDst += 2 * srcWidth; \
tilePairColsCnt--; \
} \
while (tilePairColsCnt != 0); \
\
rowCnt ++; \
pTargetBaseCur += iTargetStride; \
pSourceBaseCur += (iSourceStride * DIR); \
} \
while (rowCnt < ptSourceSize->iHeight); \
\
pTargetBaseCur += srcHeight * iTargetStride; \
tilePairRows--; \
} \
while (tilePairRows != 0); \
}
#define __ARM_2D_PAVING_2x2(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...) \
{ \
uint32_t srcWidth = ptSourceSize->iWidth; \
uint32_t srcHeight = ptSourceSize->iHeight; \
const ARM_PIX_SCLTYP(sz) *pSourceBaseCur; \
ARM_PIX_SCLTYP(sz) *pTargetBaseCur = pTargetBase; \
\
if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) { \
/* special case for 8-bit and X & XY mirror */ \
/* width does not fit in 8-bit, need widening */ \
/* will be optimized away for all other cases */ \
/* not executed unconditionally as slower */ \
__ARM_2D_PAVING_2x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
SETUP_MIRROR, LOAD, __VA_ARGS__) \
} else \
/* row iteration */ \
/* handle pair of source image row and fill in the column direction */ \
do { \
uint32_t rowCnt = 0; \
\
pSourceBaseCur = pSourceBase; \
\
/* single source row loop */ \
do { \
ARM_PIX_SCLTYP(sz) *pDst = pTargetBaseCur; \
uint32_t tilePairColsCnt = tilePairCols; \
\
/* column loop */ \
/* duplicate current source row into 2 x 2 destination across */ \
/*destination columns */ \
/* \
* +-------+ \
* |xxxxxxx| \
* | src | \
* +-------+ \
* \
* <---------------> \
* \
* || | || | || \
* * +========+=======+========+=======+ \
* | ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||... \
* | || | || | ||... \
* | +--------+-------+--------+-------+-... \
* | ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||... \
* | || | || | ||... \
* * +========+=======+========+=======+ \
* || | || | || \
*/ \
do { \
int32_t dstColCnt = srcWidth; \
const ARM_PIX_SCLTYP(sz) *pSource = pSourceBaseCur \
+ SRC_OFFSET; \
ARM_PIX_SCLTYP(sz) *pDst00 = pDst; \
ARM_PIX_SCLTYP(sz) *pDst01 = pDst00 + srcWidth; \
ARM_PIX_SCLTYP(sz) *pDst10 = pDst + srcHeight * iTargetStride; \
ARM_PIX_SCLTYP(sz) *pDst11 = pDst10 + srcWidth; \
\
SETUP_MIRROR(srcWidth); \
\
/* duplicate current source line into 2x2 destinations */ \
do { \
mve_pred16_t p = \
ARM_CONNECT2(ARM_CONNECT2(vctp, sz),q)(dstColCnt); \
ARM_PIX_VECTYP(sz) in; \
\
in = LOAD(pSource, offset); \
/* placeholder for color masking */ \
__VA_ARGS__; \
\
vst1q_p(pDst00, in, p); \
vst1q_p(pDst01, in, p); \
vst1q_p(pDst10, in, p); \
vst1q_p(pDst11, in, p); \
\
pDst00 += ARM_PIX_VECELT(sz); \
pDst01 += ARM_PIX_VECELT(sz); \
pDst10 += ARM_PIX_VECELT(sz); \
pDst11 += ARM_PIX_VECELT(sz); \
dstColCnt -= ARM_PIX_VECELT(sz); \
} \
while (dstColCnt > 0); \
\
pDst += 2 * srcWidth; \
tilePairColsCnt--; \
} \
while (tilePairColsCnt != 0); \
\
rowCnt ++; \
pTargetBaseCur += iTargetStride; \
pSourceBaseCur += (iSourceStride * DIR); \
} \
while (rowCnt < ptSourceSize->iHeight); \
\
pTargetBaseCur += srcHeight * iTargetStride; \
tilePairRows--; \
} \
while (tilePairRows != 0); \
}
/* handle c8bit X or XY 1x2 mirroring when tile width > 256 */
/* uses 8-bit widened load allowing width up to 65K */
#define __ARM_2D_PAVING_1x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
SETUP_MIRROR, LOAD, ...) \
{ \
/* row iteration */ \
/* handle pair of source image row and fill in the column direction */ \
do { \
uint32_t rowCnt = 0; \
\
pSourceBaseCur = pSourceBase; \
\
/* single source row loop */ \
do { \
int32_t dstColCnt = destWidth; \
const uint8_t *pSource = (const uint8_t*)pSourceBaseCur \
+ SRC_OFFSET; \
uint8_t *pDst = (uint8_t*)pTargetBaseCur; \
uint8_t *pDst00 = pDst; \
uint8_t *pDst10 = pDst + srcHeight * iTargetStride; \
\
/* column loop */ \
/* duplicate current source row into 1 x 2 destination across */ \
/* destination columns */ \
/* \
* +-------+ \
* |xxxxxxx| \
* | src | \
* +-------+ \
* \
* ..|| || \
* * ==+========+=... \
* | ..||xxxxxxx||... \
* | ..|| ||... \
* | ..+--------+-... \
* | ..||xxxxxxx||... \
* | ..|| ||... \
* * ==+========+=... \
* ..|| ||... \
*/ \
uint32_t curOffsetIdx = srcWidth - 1; \
uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1); \
\
/* duplicate current source line into 2x2 destinations */ \
do { \
mve_pred16_t p = vctp16q(dstColCnt); \
uint8x16_t in; \
in = vldrbq_gather_offset_z_u16(pSource, offset, p); \
offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p); \
\
/* placeholder for color masking */ \
__VA_ARGS__; \
\
vstrbq_p_u16(pDst00, in, p); \
vstrbq_p_u16(pDst10, in, p); \
\
pDst00 += 8; \
pDst10 += 8; \
dstColCnt -= 8; \
} \
while (dstColCnt > 0); \
\
rowCnt ++; \
pTargetBaseCur += iTargetStride; \
pSourceBaseCur += (iSourceStride * DIR); \
} \
while (rowCnt < ptSourceSize->iHeight); \
\
pTargetBaseCur += srcHeight * iTargetStride; \
tilePairRows--; \
} \
while (tilePairRows != 0); \
}
#define __ARM_2D_PAVING_1x2(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...) \
{ \
uint32_t srcWidth = ptSourceSize->iWidth; \
uint32_t srcHeight = ptSourceSize->iHeight; \
const ARM_PIX_SCLTYP(sz) *pSourceBaseCur; \
ARM_PIX_SCLTYP(sz) *pTargetBaseCur = pTargetBase; \
\
if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) { \
/* special case for 8-bit and X & XY mirror */ \
/* width does not fit in 8-bit, need widening */ \
/* will be optimized away for all other cases */ \
/* not executed unconditionally as slower */ \
__ARM_2D_PAVING_1x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
SETUP_MIRROR, LOAD, __VA_ARGS__) \
} else \
/* row iteration */ \
/* handle pair of source image row and fill in the column direction */ \
do { \
uint32_t rowCnt = 0; \
\
pSourceBaseCur = pSourceBase; \
\
/* single source row loop */ \
do { \
int32_t dstColCnt = destWidth; \
const ARM_PIX_SCLTYP(sz) *pSource = pSourceBaseCur + SRC_OFFSET; \
ARM_PIX_SCLTYP(sz) *pDst = pTargetBaseCur; \
ARM_PIX_SCLTYP(sz) *pDst00 = pDst; \
ARM_PIX_SCLTYP(sz) *pDst10 = pDst + srcHeight * iTargetStride; \
\
/* column loop */ \
/* duplicate current source row into 1 x 2 destination across */ \
/* destination columns */ \
/* \
* +-------+ \
* |xxxxxxx| \
* | src | \
* +-------+ \
* \
* ..|| || \
* * ==+========+=... \
* | ..||xxxxxxx||... \
* | ..|| ||... \
* | ..+--------+-... \
* | ..||xxxxxxx||... \
* | ..|| ||... \
* * ==+========+=... \
* ..|| ||... \
*/ \
SETUP_MIRROR(srcWidth); \
\
/* duplicate current source line into 2x2 destinations */ \
do { \
mve_pred16_t p = \
ARM_CONNECT2(ARM_CONNECT2(vctp, sz),q)(dstColCnt); \
ARM_PIX_VECTYP(sz) in; \
\
in = LOAD(pSource, offset); \
/* placeholder for color masking */ \
__VA_ARGS__; \
\
vst1q_p(pDst00, in, p); \
vst1q_p(pDst10, in, p); \
\
pDst00 += ARM_PIX_VECELT(sz); \
pDst10 += ARM_PIX_VECELT(sz); \
dstColCnt -= ARM_PIX_VECELT(sz); \
} \
while (dstColCnt > 0); \
\
rowCnt ++; \
pTargetBaseCur += iTargetStride; \
pSourceBaseCur += (iSourceStride * DIR); \
} \
while (rowCnt < ptSourceSize->iHeight); \
\
pTargetBaseCur += srcHeight * iTargetStride; \
tilePairRows--; \
} \
while (tilePairRows != 0); \
}
/* handle c8bit X or XY 2x1 mirroring when tile width > 256 */
/* uses 8-bit widened load allowing width up to 65K */
#define __ARM_2D_PAVING_2x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
SETUP_MIRROR, LOAD, ...) \
{ \
\
/* row iteration */ \
/* handle pair of source image row and fill in the column direction */ \
/* \
* +-------+ \
* | src | \
* +-------+ \
* \
* tilePairCols \
* <---------------+--------........> \
* \
* +=======+=======++=======+=======++...... \
* || | || | || \
* +=======+=======++=======+=======++ \
* ........ \
*/ \
\
pSourceBaseCur = pSourceBase; \
\
/* copy 2 x 2 source image block */ \
do { \
uint8_t *pDst = (uint8_t*)pTargetBaseCur; \
int16_t tilePairColsCnt = tilePairCols; \
\
do { \
int srcColCnt = srcWidth; \
const uint8_t *pSource = (const uint8_t*)pSourceBaseCur \
+ SRC_OFFSET; \
uint8_t *pDst00 = pDst; \
uint8_t *pDst01 = pDst00 + srcColCnt; \
\
uint32_t curOffsetIdx = srcWidth - 1; \
uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1); \
\
/* duplicate current source line into 4 destinations */ \
do { \
mve_pred16_t p = vctp16q(srcColCnt); \
uint8x16_t in; \
in = vldrbq_gather_offset_z_u16(pSource, offset, p); \
offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p); \
/* placeholder for color masking */ \
__VA_ARGS__; \
\
vstrbq_p_u16(pDst00, in, p); \
vstrbq_p_u16(pDst01, in, p); \
\
pDst00 += 8; \
pDst01 += 8; \
srcColCnt -= 8; \
} \
while ((int32_t) srcColCnt > 0); \
\
pDst += 2 * srcWidth; \
tilePairColsCnt--; \
} \
while (tilePairColsCnt != 0); \
\
rowCnt++; \
pTargetBaseCur += iTargetStride; \
pSourceBaseCur += (iSourceStride * DIR); \
} \
while (rowCnt < destHeight); \
}
#define __ARM_2D_PAVING_2x1(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...) \
{ \
uint32_t srcWidth = ptSourceSize->iWidth; \
const ARM_PIX_SCLTYP(sz) *pSourceBaseCur; \
ARM_PIX_SCLTYP(sz) *pTargetBaseCur = pTargetBase; \
uint32_t rowCnt = 0; \
\
if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) { \
/* special case for 8-bit and X & XY mirror */ \
/* width does not fit in 8-bit, need widening */ \
/* will be optimized away for all other cases */ \
/* not executed unconditionally as slower */ \
__ARM_2D_PAVING_2x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
SETUP_MIRROR, LOAD, __VA_ARGS__) \
} else { \
/* row iteration */ \
/* handle pair of source image row and fill in the column direction */ \
/* \
* +-------+ \
* | src | \
* +-------+ \
* \
* tilePairCols \
* <---------------+--------........> \
* \
* +=======+=======++=======+=======++...... \
* || | || | || \
* +=======+=======++=======+=======++ \
* ........ \
*/ \
\
pSourceBaseCur = pSourceBase; \
\
/* copy 2 x 2 source image block */ \
do { \
ARM_PIX_SCLTYP(sz) *pDst = pTargetBaseCur; \
ARM_PIX_SCLTYP(sz) tilePairColsCnt = tilePairCols; \
\
do { \
int srcColCnt = srcWidth; \
const ARM_PIX_SCLTYP(sz) *pSource = pSourceBaseCur + SRC_OFFSET; \
ARM_PIX_SCLTYP(sz) *pDst00 = pDst; \
ARM_PIX_SCLTYP(sz) *pDst01 = pDst00 + srcColCnt; \
\
SETUP_MIRROR(srcWidth); \
\
/* duplicate current source line into 4 destinations */ \
do { \
mve_pred16_t p = \
ARM_CONNECT2(ARM_CONNECT2(vctp, sz),q)(srcColCnt); \
ARM_PIX_VECTYP(sz) in; \
\
in = LOAD(pSource, offset); \
/* placeholder for color masking */ \
__VA_ARGS__; \
\
vst1q_p(pDst00, in, p); \
vst1q_p(pDst01, in, p); \
\
pDst00 += ARM_PIX_VECELT(sz); \
pDst01 += ARM_PIX_VECELT(sz); \
srcColCnt -= ARM_PIX_VECELT(sz); \
} \
while ((int32_t) srcColCnt > 0); \
\
pDst += 2 * srcWidth; \
tilePairColsCnt--; \
} \
while (tilePairColsCnt != 0); \
\
rowCnt++; \
pTargetBaseCur += iTargetStride; \
pSourceBaseCur += (iSourceStride * DIR); \
} \
while (rowCnt < destHeight); \
} \
}
/* handle c8bit X or XY 1x1 mirroring when tile width > 256 */
/* uses 8-bit widened load allowing width up to 65K */
#define __ARM_2D_PAVING_1x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
SETUP_MIRROR, LOAD, ...) \
{ \
pSource += SRC_OFFSET; \
\
for (int_fast16_t y = 0; y < ptDstCopySize->iHeight; y++) { \
uint8_t *pDst = (uint8_t *)pTarget; \
const uint8_t *pSrc = (const uint8_t*)pSource; \
uint32_t dstWidth = ptDstCopySize->iWidth; \
uint32_t curOffsetIdx = srcWidth - 1; \
uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1); \
\
do { \
mve_pred16_t p = vctp16q(dstWidth); \
uint8x16_t in; \
in = vldrbq_gather_offset_z_u16(pSrc, offset, p); \
offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p); \
/* placeholder for color masking */ \
__VA_ARGS__; \
\
vstrbq_p_u16(pDst, in, p); \
\
pDst += 8; \
dstWidth -= 8; \
} \
while ((int32_t) dstWidth > 0); \
\
pSource += (iSourceStride * DIR); \
pTarget += iTargetStride; \
} \
}
#define __ARM_2D_PAVING_1x1(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...) \
{ \
uint32_t srcWidth = ptSrcCopySize->iWidth; \
if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) { \
/* special case for 8-bit and X & XY mirror */ \
/* width does not fit in 8-bit, need widening */ \
/* will be optimized away for all other cases */ \
/* not executed unconditionally as slower */ \
__ARM_2D_PAVING_1x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
SETUP_MIRROR, LOAD, __VA_ARGS__) \
} else { \
pSource += SRC_OFFSET; \
\
for (int_fast16_t y = 0; y < ptDstCopySize->iHeight; y++) { \
ARM_PIX_SCLTYP(sz) *pDst = pTarget; \
ARM_PIX_SCLTYP(sz) *pSrc = pSource; \
uint32_t dstWidth = ptDstCopySize->iWidth; \
\
SETUP_MIRROR(srcWidth); \
\
do { \
mve_pred16_t p = \
ARM_CONNECT2(ARM_CONNECT2(vctp, sz), q) (dstWidth); \
ARM_PIX_VECTYP(sz) in; \
\
in = LOAD(pSrc, offset); \
/* placeholder for color masking */ \
__VA_ARGS__; \
\
vst1q_p(pDst, in, p); \
\
pDst += ARM_PIX_VECELT(sz); \
dstWidth -= ARM_PIX_VECELT(sz); \
} \
while ((int32_t) dstWidth > 0); \
\
pSource += (iSourceStride * DIR); \
pTarget += iTargetStride; \
} \
} \
}
/*============================ TYPES =========================================*/
/*============================ GLOBAL VARIABLES ==============================*/
/*============================ PROTOTYPES ====================================*/
#ifdef __cplusplus
}
#endif
#endif