mirror of
https://gitee.com/Lyon1998/pikapython.git
synced 2025-01-22 17:12:55 +08:00
705 lines
49 KiB
C
705 lines
49 KiB
C
/*
|
|
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the License); you may
|
|
* not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/* ----------------------------------------------------------------------
|
|
* Project: Arm-2D Library
|
|
* Title: arm-2d_paving_helium.h
|
|
* Description: Provides definitions and code templates for generic paving
|
|
*
|
|
* $Date: 17. Sep 2021
|
|
* $Revision: V 0.6.0
|
|
*
|
|
* Target Processor: Cortex-M cores
|
|
*
|
|
* -------------------------------------------------------------------- */
|
|
|
|
|
|
#ifndef __ARM_2D_PAVING_HELIUM_H__
|
|
#define __ARM_2D_PAVING_HELIUM_H__
|
|
|
|
/*============================ INCLUDES ======================================*/
|
|
#include "arm_2d.h"
|
|
#include "__arm_2d_paving.h"
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/*============================ MACROS ========================================*/
|
|
/*============================ MACROFIED FUNCTIONS ===========================*/
|
|
|
|
|
|
/*
 * Drop any earlier definitions of the load/setup helpers and of the paving
 * code templates so that the Helium variants below can be defined without
 * macro-redefinition diagnostics.
 */
#undef LOAD_SRC_DIRECT_16
#undef LOAD_SRC_DIRECT_32
#undef LOAD_SRC_X_MIRROR_16
#undef LOAD_SRC_X_MIRROR_32
#undef SETUP_MIRROR_COPY_16
#undef SETUP_MIRROR_COPY_32
#undef __ARM_2D_PAVING_2x2
#undef __ARM_2D_PAVING_1x2
#undef __ARM_2D_PAVING_2x1
#undef __ARM_2D_PAVING_1x1
|
|
|
|
|
|
|
|
/*
 * Colour-masking hook, meant to be passed as the variadic argument of the
 * paving templates.
 *
 * Narrows the predicate `p` so that only the lanes of `in` that differ from
 * `colour` (and were already active in `p`) remain enabled; the predicated
 * stores that follow then skip pixels equal to the masking colour.
 *
 * `in` and `p` are not parameters: they are the locals of the expansion site.
 * `sz` is accepted but not used by this implementation.
 */
#define CMP_CL_MSK(sz, colour)          p = vcmpneq_m(in, colour, p)
|
|
|
|
|
|
/* Remove any previous load / mirror-setup helpers before redefining them. */
#undef LOAD_SRC_DIRECT_8
#undef LOAD_SRC_DIRECT_16
#undef LOAD_SRC_DIRECT_32
#undef LOAD_SRC_X_MIRROR_8
#undef LOAD_SRC_X_MIRROR_16
#undef LOAD_SRC_X_MIRROR_32
#undef SETUP_MIRROR_COPY_8
#undef SETUP_MIRROR_COPY_16
#undef SETUP_MIRROR_COPY_32
|
|
|
|
|
|
/*
 * Direct (non-mirrored) source loads.
 *
 * Intended use is `in = LOAD_SRC_DIRECT_n(pSource, offset);` - the expansion
 * is the predicated contiguous load followed by a second statement that
 * advances `pSource` by one vector worth of pixels (16, 8 or 4 elements).
 * The predicate `p` is taken from the expansion site; `offset` is ignored and
 * only exists so that the direct and mirrored loaders share one signature.
 *
 * NOTE: because the expansion is two statements, these macros must not be
 * used as the sole body of an unbraced if/else/loop.
 */
#define LOAD_SRC_DIRECT_8(pSource, offset)                          \
            vld1q_z(pSource, p);                                    \
            pSource += 16;

#define LOAD_SRC_DIRECT_16(pSource, offset)                         \
            vld1q_z(pSource, p);                                    \
            pSource += 8;

#define LOAD_SRC_DIRECT_32(pSource, offset)                         \
            vld1q_z(pSource, p);                                    \
            pSource += 4;
|
|
|
|
/*
 * X-mirrored source loads.
 *
 * Intended use is `in = LOAD_SRC_X_MIRROR_n(pSource, offset);` - the
 * expansion gathers the pixels addressed by the `offset` vector (element
 * indices; the 16/32-bit variants use the "shifted offset" gather forms) and
 * then, as a second statement, refreshes `offset` with the next run of
 * decrementing indices generated from `curOffsetIdx`.
 *
 * `p` (predicate) and `curOffsetIdx` (write-back index) are not parameters:
 * they are locals of the expansion site, normally created by the matching
 * SETUP_MIRROR_COPY_n macro and the template loop.
 *
 * NOTE: because the expansion is two statements, these macros must not be
 * used as the sole body of an unbraced if/else/loop.
 */
#define LOAD_SRC_X_MIRROR_8(pSource, offset)                        \
            vldrbq_gather_offset_z(pSource, offset, p);             \
            offset = vddupq_x_wb_u8(&curOffsetIdx, 1, p);

#define LOAD_SRC_X_MIRROR_16(pSource, offset)                       \
            vldrhq_gather_shifted_offset_z(pSource, offset, p);     \
            offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p);

#define LOAD_SRC_X_MIRROR_32(pSource, offset)                       \
            vldrwq_gather_shifted_offset_z(pSource, offset, p);     \
            offset = vddupq_x_wb_u32(&curOffsetIdx, 1, p);
|
|
|
|
/*
 * Prepare the Helium gather-load offsets for an X-mirrored row copy.
 *
 * Declares two locals at the expansion site:
 *   - curOffsetIdx : running start index, initialised to srcWidth - 1 and
 *                    updated through the write-back of vddupq
 *   - offset       : first vector of decrementing element indices
 *
 * The 8-bit variant generates 8-bit indices, so it can only address source
 * rows whose last index (srcWidth - 1) fits in 8 bits; wider 8-bit tiles are
 * handled by the *_8BIT_X_MIRROR_FIXUP templates.
 */
#define SETUP_MIRROR_COPY_8(srcWidth)                               \
            uint32_t    curOffsetIdx = (srcWidth) - 1;              \
            uint8x16_t  offset = vddupq_wb_u8(&curOffsetIdx, 1);

#define SETUP_MIRROR_COPY_16(srcWidth)                              \
            uint32_t    curOffsetIdx = (srcWidth) - 1;              \
            uint16x8_t  offset = vddupq_wb_u16(&curOffsetIdx, 1);

#define SETUP_MIRROR_COPY_32(srcWidth)                              \
            uint32_t    curOffsetIdx = (srcWidth) - 1;              \
            uint32x4_t  offset = vddupq_wb_u32(&curOffsetIdx, 1);
|
|
|
|
|
|
/*
 * Predicates used by the paving templates to detect X / XY mirroring.
 *
 * The templates evaluate `IS_##LOAD`; since `##` pastes the unexpanded
 * argument, a LOAD argument spelled PAVING_<mode>_LOAD_PATTERN(sz) becomes a
 * call to one of the macros below. Only the 8-bit X and XY mirror patterns
 * evaluate to 1, which is what routes wide 8-bit mirrored tiles to the
 * *_8BIT_X_MIRROR_FIXUP code path.
 */
#define IS_PAVING_DIRECT_LOAD_PATTERN(sz)       (0)
#define IS_PAVING_X_MIRROR_LOAD_PATTERN(sz)     (((sz) == 8) ? 1 : 0)
#define IS_PAVING_Y_MIRROR_LOAD_PATTERN(sz)     (0)
#define IS_PAVING_XY_MIRROR_LOAD_PATTERN(sz)    (((sz) == 8) ? 1 : 0)
|
|
|
|
|
|
/*
 * 8-bit (c8bit) X / XY mirrored 2x2 paving for tiles whose width is >= 256.
 *
 * The generic 8-bit mirrored path builds 8-bit gather offsets, which cannot
 * index such rows. This variant gathers source bytes with 16-bit offsets
 * (vldrbq_gather_offset_z_u16) and stores them back narrowed to bytes
 * (vstrbq_p_u16), handling 8 pixels per iteration and supporting row indices
 * up to 65535.
 *
 * Must be expanded inside __ARM_2D_PAVING_2x2: it relies on that template's
 * locals (srcWidth, srcHeight, pSourceBaseCur, pTargetBaseCur) and on the
 * caller's pSourceBase, iSourceStride, iTargetStride, ptSourceSize,
 * tilePairCols and tilePairRows (the latter is consumed).
 *
 * SRC_OFFSET   element offset added to the start of each source row
 * DIR          multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR, LOAD
 *              accepted for signature parity with the generic template; the
 *              offset setup and gather load are hard-coded here
 * ...          optional statement run between load and store (colour
 *              masking); it may read `in` and update the predicate `p`
 */
#define __ARM_2D_PAVING_2x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
                                                SETUP_MIRROR, LOAD, ...) \
{ \
    /* row iteration */ \
    /* handle pair of source image row and fill in the column direction */ \
    do { \
        uint32_t rowCnt = 0; \
        \
        pSourceBaseCur = pSourceBase; \
        \
        /* single source row loop */ \
        do { \
            uint8_t *pDst = (uint8_t*)pTargetBaseCur; \
            uint32_t tilePairColsCnt = tilePairCols; \
            \
            /* column loop */ \
            /* duplicate current source row into 2 x 2 destination across */ \
            /* destination columns */ \
            /* \
             *  +-------+ \
             *  |xxxxxxx| \
             *  |  src  | \
             *  +-------+ \
             * \
             *     <---------------> \
             * \
             *     ||       |       ||       |       || \
             *   * +========+=======+========+=======+ \
             *   | ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||... \
             *   | ||       |       ||       |       ||... \
             *   | +--------+-------+--------+-------+-... \
             *   | ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||... \
             *   | ||       |       ||       |       ||... \
             *   * +========+=======+========+=======+ \
             *     ||       |       ||       |       || \
             */ \
            do { \
                int32_t dstColCnt = srcWidth; \
                const uint8_t *pSource = (const uint8_t*)pSourceBaseCur \
                                                    + SRC_OFFSET; \
                uint8_t *pDst00 = (uint8_t*)pDst; \
                uint8_t *pDst01 = (uint8_t*)pDst00 + srcWidth; \
                uint8_t *pDst10 = (uint8_t*)pDst + srcHeight * \
                                                    iTargetStride; \
                uint8_t *pDst11 = (uint8_t*)pDst10 + srcWidth; \
                \
                /* 16-bit decrementing indices, starting at the last pixel */ \
                uint32_t curOffsetIdx = srcWidth - 1; \
                uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1); \
                \
                /* duplicate current source line into 2x2 destinations */ \
                do { \
                    mve_pred16_t p = vctp16q(dstColCnt); \
                    /* bytes are gathered into (and stored from) 16-bit lanes */ \
                    uint16x8_t in; \
                    \
                    in = vldrbq_gather_offset_z_u16(pSource, offset, p); \
                    offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p); \
                    \
                    /* placeholder for color masking */ \
                    __VA_ARGS__; \
                    \
                    vstrbq_p_u16(pDst00, in, p); \
                    vstrbq_p_u16(pDst01, in, p); \
                    vstrbq_p_u16(pDst10, in, p); \
                    vstrbq_p_u16(pDst11, in, p); \
                    \
                    pDst00 += 8; \
                    pDst01 += 8; \
                    pDst10 += 8; \
                    pDst11 += 8; \
                    dstColCnt -= 8; \
                } \
                while (dstColCnt > 0); \
                \
                pDst += 2 * srcWidth; \
                tilePairColsCnt--; \
            } \
            while (tilePairColsCnt != 0); \
            \
            rowCnt ++; \
            pTargetBaseCur += iTargetStride; \
            pSourceBaseCur += (iSourceStride * DIR); \
        } \
        while (rowCnt < ptSourceSize->iHeight); \
        \
        /* skip the second tile row, already written through pDst10/pDst11 */ \
        pTargetBaseCur += srcHeight * iTargetStride; \
        tilePairRows--; \
    } \
    while (tilePairRows != 0); \
}
|
|
|
|
|
|
|
|
/*
 * Helium code template: pave the target with 2x2 groups of the source tile.
 *
 * Each source row is loaded once per tile pair and written to four places:
 * the two horizontally adjacent tiles of the current tile row (pDst00,
 * pDst01) and the same two tiles one tile-height further down (pDst10,
 * pDst11).
 *
 * Names taken from the expansion site: ptSourceSize, pSourceBase,
 * pTargetBase, iSourceStride, iTargetStride, tilePairCols and tilePairRows
 * (tilePairRows is consumed). Both tile-pair counts must be non-zero since
 * the loops are do/while.
 *
 * sz            pixel size in bits; selects scalar/vector types and the
 *               vctp<sz>q tail predicate
 * SRC_OFFSET    element offset added to the start of each source row
 * DIR           multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR  macro invoked as SETUP_MIRROR(srcWidth) before each row copy
 * LOAD          load pattern, invoked as `in = LOAD(pSource, offset);` and
 *               also pasted as IS_##LOAD to detect 8-bit X/XY mirroring
 * ...           optional statement run between load and store (colour
 *               masking); it may read `in` and update the predicate `p`
 */
#define __ARM_2D_PAVING_2x2(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...) \
{ \
    uint32_t srcWidth = ptSourceSize->iWidth; \
    uint32_t srcHeight = ptSourceSize->iHeight; \
    const ARM_PIX_SCLTYP(sz) *pSourceBaseCur; \
    ARM_PIX_SCLTYP(sz) *pTargetBaseCur = pTargetBase; \
    \
    if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) { \
        /* special case for 8-bit and X & XY mirror */ \
        /* width does not fit in 8-bit, need widening */ \
        /* will be optimized away for all other cases */ \
        /* not executed unconditionally as slower */ \
        __ARM_2D_PAVING_2x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
                                    SETUP_MIRROR, LOAD, __VA_ARGS__) \
    } else \
    /* row iteration */ \
    /* handle pair of source image row and fill in the column direction */ \
    do { \
        uint32_t rowCnt = 0; \
        \
        pSourceBaseCur = pSourceBase; \
        \
        /* single source row loop */ \
        do { \
            ARM_PIX_SCLTYP(sz) *pDst = pTargetBaseCur; \
            uint32_t tilePairColsCnt = tilePairCols; \
            \
            /* column loop */ \
            /* duplicate current source row into 2 x 2 destination across */ \
            /* destination columns */ \
            /* \
             *  +-------+ \
             *  |xxxxxxx| \
             *  |  src  | \
             *  +-------+ \
             * \
             *     <---------------> \
             * \
             *     ||       |       ||       |       || \
             *   * +========+=======+========+=======+ \
             *   | ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||... \
             *   | ||       |       ||       |       ||... \
             *   | +--------+-------+--------+-------+-... \
             *   | ||xxxxxxx|xxxxxxx||xxxxxxx|xxxxxxx||... \
             *   | ||       |       ||       |       ||... \
             *   * +========+=======+========+=======+ \
             *     ||       |       ||       |       || \
             */ \
            do { \
                int32_t dstColCnt = srcWidth; \
                const ARM_PIX_SCLTYP(sz) *pSource = pSourceBaseCur \
                                                    + SRC_OFFSET; \
                ARM_PIX_SCLTYP(sz) *pDst00 = pDst; \
                ARM_PIX_SCLTYP(sz) *pDst01 = pDst00 + srcWidth; \
                ARM_PIX_SCLTYP(sz) *pDst10 = pDst + srcHeight * iTargetStride; \
                ARM_PIX_SCLTYP(sz) *pDst11 = pDst10 + srcWidth; \
                \
                SETUP_MIRROR(srcWidth); \
                \
                /* duplicate current source line into 2x2 destinations */ \
                do { \
                    /* tail predicate: only the remaining pixels are active */ \
                    mve_pred16_t p = \
                        ARM_CONNECT2(ARM_CONNECT2(vctp, sz),q)(dstColCnt); \
                    ARM_PIX_VECTYP(sz) in; \
                    \
                    in = LOAD(pSource, offset); \
                    /* placeholder for color masking */ \
                    __VA_ARGS__; \
                    \
                    vst1q_p(pDst00, in, p); \
                    vst1q_p(pDst01, in, p); \
                    vst1q_p(pDst10, in, p); \
                    vst1q_p(pDst11, in, p); \
                    \
                    pDst00 += ARM_PIX_VECELT(sz); \
                    pDst01 += ARM_PIX_VECELT(sz); \
                    pDst10 += ARM_PIX_VECELT(sz); \
                    pDst11 += ARM_PIX_VECELT(sz); \
                    dstColCnt -= ARM_PIX_VECELT(sz); \
                } \
                while (dstColCnt > 0); \
                \
                pDst += 2 * srcWidth; \
                tilePairColsCnt--; \
            } \
            while (tilePairColsCnt != 0); \
            \
            rowCnt ++; \
            pTargetBaseCur += iTargetStride; \
            pSourceBaseCur += (iSourceStride * DIR); \
        } \
        while (rowCnt < ptSourceSize->iHeight); \
        \
        /* skip the second tile row, already written through pDst10/pDst11 */ \
        pTargetBaseCur += srcHeight * iTargetStride; \
        tilePairRows--; \
    } \
    while (tilePairRows != 0); \
}
|
|
|
|
|
|
/*
 * 8-bit (c8bit) X / XY mirrored 1x2 paving for tiles whose width is >= 256.
 *
 * Gathers source bytes with 16-bit offsets (vldrbq_gather_offset_z_u16) and
 * stores them back narrowed to bytes (vstrbq_p_u16), 8 pixels per iteration,
 * so row indices up to 65535 can be addressed.
 *
 * Must be expanded inside __ARM_2D_PAVING_1x2: it relies on that template's
 * locals (srcWidth, srcHeight, pSourceBaseCur, pTargetBaseCur) and on the
 * caller's pSourceBase, destWidth, iSourceStride, iTargetStride,
 * ptSourceSize and tilePairRows (the latter is consumed).
 *
 * SRC_OFFSET   element offset added to the start of each source row
 * DIR          multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR, LOAD
 *              accepted for signature parity with the generic template; the
 *              offset setup and gather load are hard-coded here
 * ...          optional statement run between load and store (colour
 *              masking); it may read `in` and update the predicate `p`
 */
#define __ARM_2D_PAVING_1x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
                                                SETUP_MIRROR, LOAD, ...) \
{ \
    /* row iteration */ \
    /* handle pair of source image row and fill in the column direction */ \
    do { \
        uint32_t rowCnt = 0; \
        \
        pSourceBaseCur = pSourceBase; \
        \
        /* single source row loop */ \
        do { \
            int32_t dstColCnt = destWidth; \
            const uint8_t *pSource = (const uint8_t*)pSourceBaseCur \
                                                + SRC_OFFSET; \
            uint8_t *pDst = (uint8_t*)pTargetBaseCur; \
            uint8_t *pDst00 = pDst; \
            uint8_t *pDst10 = pDst + srcHeight * iTargetStride; \
            \
            /* column loop */ \
            /* duplicate current source row into 1 x 2 destination across */ \
            /* destination columns */ \
            /* \
             *  +-------+ \
             *  |xxxxxxx| \
             *  |  src  | \
             *  +-------+ \
             * \
             *    ..||       || \
             *   * ==+========+=... \
             *   | ..||xxxxxxx||... \
             *   | ..||       ||... \
             *   | ..+--------+-... \
             *   | ..||xxxxxxx||... \
             *   | ..||       ||... \
             *   * ==+========+=... \
             *     ..||       ||... \
             */ \
            /* 16-bit decrementing indices, starting at the last pixel */ \
            uint32_t curOffsetIdx = srcWidth - 1; \
            uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1); \
            \
            /* duplicate current source line into 1x2 destinations */ \
            do { \
                mve_pred16_t p = vctp16q(dstColCnt); \
                /* bytes are gathered into (and stored from) 16-bit lanes */ \
                uint16x8_t in; \
                \
                in = vldrbq_gather_offset_z_u16(pSource, offset, p); \
                offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p); \
                \
                /* placeholder for color masking */ \
                __VA_ARGS__; \
                \
                vstrbq_p_u16(pDst00, in, p); \
                vstrbq_p_u16(pDst10, in, p); \
                \
                pDst00 += 8; \
                pDst10 += 8; \
                dstColCnt -= 8; \
            } \
            while (dstColCnt > 0); \
            \
            rowCnt ++; \
            pTargetBaseCur += iTargetStride; \
            pSourceBaseCur += (iSourceStride * DIR); \
        } \
        while (rowCnt < ptSourceSize->iHeight); \
        \
        /* skip the second tile row, already written through pDst10 */ \
        pTargetBaseCur += srcHeight * iTargetStride; \
        tilePairRows--; \
    } \
    while (tilePairRows != 0); \
}
|
|
|
|
|
|
/*
 * Helium code template: pave a single tile column with vertical tile pairs.
 *
 * Each source row is loaded once and written to two places: the current
 * target row (pDst00) and the row one tile-height further down (pDst10).
 * The inner loop runs over destWidth pixels.
 *
 * Names taken from the expansion site: ptSourceSize, pSourceBase,
 * pTargetBase, destWidth, iSourceStride, iTargetStride and tilePairRows
 * (consumed; must be non-zero since the loop is do/while).
 *
 * sz            pixel size in bits; selects scalar/vector types and the
 *               vctp<sz>q tail predicate
 * SRC_OFFSET    element offset added to the start of each source row
 * DIR           multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR  macro invoked as SETUP_MIRROR(srcWidth) before each row copy
 * LOAD          load pattern, invoked as `in = LOAD(pSource, offset);` and
 *               also pasted as IS_##LOAD to detect 8-bit X/XY mirroring
 * ...           optional statement run between load and store (colour
 *               masking); it may read `in` and update the predicate `p`
 */
#define __ARM_2D_PAVING_1x2(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...) \
{ \
    uint32_t srcWidth = ptSourceSize->iWidth; \
    uint32_t srcHeight = ptSourceSize->iHeight; \
    const ARM_PIX_SCLTYP(sz) *pSourceBaseCur; \
    ARM_PIX_SCLTYP(sz) *pTargetBaseCur = pTargetBase; \
    \
    if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) { \
        /* special case for 8-bit and X & XY mirror */ \
        /* width does not fit in 8-bit, need widening */ \
        /* will be optimized away for all other cases */ \
        /* not executed unconditionally as slower */ \
        __ARM_2D_PAVING_1x2_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
                                    SETUP_MIRROR, LOAD, __VA_ARGS__) \
    } else \
    /* row iteration */ \
    /* handle pair of source image row and fill in the column direction */ \
    do { \
        uint32_t rowCnt = 0; \
        \
        pSourceBaseCur = pSourceBase; \
        \
        /* single source row loop */ \
        do { \
            int32_t dstColCnt = destWidth; \
            const ARM_PIX_SCLTYP(sz) *pSource = pSourceBaseCur + SRC_OFFSET; \
            ARM_PIX_SCLTYP(sz) *pDst = pTargetBaseCur; \
            ARM_PIX_SCLTYP(sz) *pDst00 = pDst; \
            ARM_PIX_SCLTYP(sz) *pDst10 = pDst + srcHeight * iTargetStride; \
            \
            /* column loop */ \
            /* duplicate current source row into 1 x 2 destination across */ \
            /* destination columns */ \
            /* \
             *  +-------+ \
             *  |xxxxxxx| \
             *  |  src  | \
             *  +-------+ \
             * \
             *    ..||       || \
             *   * ==+========+=... \
             *   | ..||xxxxxxx||... \
             *   | ..||       ||... \
             *   | ..+--------+-... \
             *   | ..||xxxxxxx||... \
             *   | ..||       ||... \
             *   * ==+========+=... \
             *     ..||       ||... \
             */ \
            SETUP_MIRROR(srcWidth); \
            \
            /* duplicate current source line into 1x2 destinations */ \
            do { \
                /* tail predicate: only the remaining pixels are active */ \
                mve_pred16_t p = \
                    ARM_CONNECT2(ARM_CONNECT2(vctp, sz),q)(dstColCnt); \
                ARM_PIX_VECTYP(sz) in; \
                \
                in = LOAD(pSource, offset); \
                /* placeholder for color masking */ \
                __VA_ARGS__; \
                \
                vst1q_p(pDst00, in, p); \
                vst1q_p(pDst10, in, p); \
                \
                pDst00 += ARM_PIX_VECELT(sz); \
                pDst10 += ARM_PIX_VECELT(sz); \
                dstColCnt -= ARM_PIX_VECELT(sz); \
            } \
            while (dstColCnt > 0); \
            \
            rowCnt ++; \
            pTargetBaseCur += iTargetStride; \
            pSourceBaseCur += (iSourceStride * DIR); \
        } \
        while (rowCnt < ptSourceSize->iHeight); \
        \
        /* skip the second tile row, already written through pDst10 */ \
        pTargetBaseCur += srcHeight * iTargetStride; \
        tilePairRows--; \
    } \
    while (tilePairRows != 0); \
}
|
|
|
|
/*
 * 8-bit (c8bit) X / XY mirrored 2x1 paving for tiles whose width is >= 256.
 *
 * Gathers source bytes with 16-bit offsets (vldrbq_gather_offset_z_u16) and
 * stores them back narrowed to bytes (vstrbq_p_u16), 8 pixels per iteration,
 * so row indices up to 65535 can be addressed.
 *
 * Must be expanded inside __ARM_2D_PAVING_2x1: it relies on that template's
 * locals (srcWidth, rowCnt, pSourceBaseCur, pTargetBaseCur) and on the
 * caller's pSourceBase, destHeight, iSourceStride, iTargetStride and
 * tilePairCols (which must be non-zero since the loop is do/while).
 *
 * SRC_OFFSET   element offset added to the start of each source row
 * DIR          multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR, LOAD
 *              accepted for signature parity with the generic template; the
 *              offset setup and gather load are hard-coded here
 * ...          optional statement run between load and store (colour
 *              masking); it may read `in` and update the predicate `p`
 */
#define __ARM_2D_PAVING_2x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
                                                SETUP_MIRROR, LOAD, ...) \
{ \
    \
    /* row iteration */ \
    /* handle pair of source image row and fill in the column direction */ \
    /* \
     *  +-------+ \
     *  |  src  | \
     *  +-------+ \
     * \
     *            tilePairCols \
     *   <---------------+--------........> \
     * \
     *   +=======+=======++=======+=======++...... \
     *   ||      |       ||       |       || \
     *   +=======+=======++=======+=======++ \
     *               ........ \
     */ \
    \
    pSourceBaseCur = pSourceBase; \
    \
    /* copy 2 x 1 source image block */ \
    do { \
        uint8_t *pDst = (uint8_t*)pTargetBaseCur; \
        /* same counter width as the 2x2 template */ \
        uint32_t tilePairColsCnt = tilePairCols; \
        \
        do { \
            int srcColCnt = srcWidth; \
            const uint8_t *pSource = (const uint8_t*)pSourceBaseCur \
                                                + SRC_OFFSET; \
            uint8_t *pDst00 = pDst; \
            uint8_t *pDst01 = pDst00 + srcColCnt; \
            \
            /* 16-bit decrementing indices, starting at the last pixel */ \
            uint32_t curOffsetIdx = srcWidth - 1; \
            uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1); \
            \
            /* duplicate current source line into 2 destinations */ \
            do { \
                mve_pred16_t p = vctp16q(srcColCnt); \
                /* bytes are gathered into (and stored from) 16-bit lanes */ \
                uint16x8_t in; \
                \
                in = vldrbq_gather_offset_z_u16(pSource, offset, p); \
                offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p); \
                /* placeholder for color masking */ \
                __VA_ARGS__; \
                \
                vstrbq_p_u16(pDst00, in, p); \
                vstrbq_p_u16(pDst01, in, p); \
                \
                pDst00 += 8; \
                pDst01 += 8; \
                srcColCnt -= 8; \
            } \
            while ((int32_t) srcColCnt > 0); \
            \
            pDst += 2 * srcWidth; \
            tilePairColsCnt--; \
        } \
        while (tilePairColsCnt != 0); \
        \
        rowCnt++; \
        pTargetBaseCur += iTargetStride; \
        pSourceBaseCur += (iSourceStride * DIR); \
    } \
    while (rowCnt < destHeight); \
}
|
|
|
|
|
|
/*
 * Helium code template: pave a single tile row with horizontal tile pairs.
 *
 * Each source row is loaded once per tile pair and written to the two
 * horizontally adjacent tiles (pDst00, pDst01); destHeight rows are
 * processed.
 *
 * Names taken from the expansion site: ptSourceSize, pSourceBase,
 * pTargetBase, destHeight, iSourceStride, iTargetStride and tilePairCols
 * (which must be non-zero since the loop is do/while).
 *
 * sz            pixel size in bits; selects scalar/vector types and the
 *               vctp<sz>q tail predicate
 * SRC_OFFSET    element offset added to the start of each source row
 * DIR           multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR  macro invoked as SETUP_MIRROR(srcWidth) before each row copy
 * LOAD          load pattern, invoked as `in = LOAD(pSource, offset);` and
 *               also pasted as IS_##LOAD to detect 8-bit X/XY mirroring
 * ...           optional statement run between load and store (colour
 *               masking); it may read `in` and update the predicate `p`
 */
#define __ARM_2D_PAVING_2x1(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...) \
{ \
    uint32_t srcWidth = ptSourceSize->iWidth; \
    const ARM_PIX_SCLTYP(sz) *pSourceBaseCur; \
    ARM_PIX_SCLTYP(sz) *pTargetBaseCur = pTargetBase; \
    uint32_t rowCnt = 0; \
    \
    if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) { \
        /* special case for 8-bit and X & XY mirror */ \
        /* width does not fit in 8-bit, need widening */ \
        /* will be optimized away for all other cases */ \
        /* not executed unconditionally as slower */ \
        __ARM_2D_PAVING_2x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
                                    SETUP_MIRROR, LOAD, __VA_ARGS__) \
    } else { \
        /* row iteration */ \
        /* handle pair of source image row and fill in the column direction */ \
        /* \
         *  +-------+ \
         *  |  src  | \
         *  +-------+ \
         * \
         *            tilePairCols \
         *   <---------------+--------........> \
         * \
         *   +=======+=======++=======+=======++...... \
         *   ||      |       ||       |       || \
         *   +=======+=======++=======+=======++ \
         *               ........ \
         */ \
        \
        pSourceBaseCur = pSourceBase; \
        \
        /* copy 2 x 1 source image block */ \
        do { \
            ARM_PIX_SCLTYP(sz) *pDst = pTargetBaseCur; \
            /* a tile counter, not a pixel: keep it out of the pixel type */ \
            /* so that it is not truncated to the pixel width */ \
            uint32_t tilePairColsCnt = tilePairCols; \
            \
            do { \
                int srcColCnt = srcWidth; \
                const ARM_PIX_SCLTYP(sz) *pSource = pSourceBaseCur + SRC_OFFSET; \
                ARM_PIX_SCLTYP(sz) *pDst00 = pDst; \
                ARM_PIX_SCLTYP(sz) *pDst01 = pDst00 + srcColCnt; \
                \
                SETUP_MIRROR(srcWidth); \
                \
                /* duplicate current source line into 2 destinations */ \
                do { \
                    /* tail predicate: only the remaining pixels are active */ \
                    mve_pred16_t p = \
                        ARM_CONNECT2(ARM_CONNECT2(vctp, sz),q)(srcColCnt); \
                    ARM_PIX_VECTYP(sz) in; \
                    \
                    in = LOAD(pSource, offset); \
                    /* placeholder for color masking */ \
                    __VA_ARGS__; \
                    \
                    vst1q_p(pDst00, in, p); \
                    vst1q_p(pDst01, in, p); \
                    \
                    pDst00 += ARM_PIX_VECELT(sz); \
                    pDst01 += ARM_PIX_VECELT(sz); \
                    srcColCnt -= ARM_PIX_VECELT(sz); \
                } \
                while ((int32_t) srcColCnt > 0); \
                \
                pDst += 2 * srcWidth; \
                tilePairColsCnt--; \
            } \
            while (tilePairColsCnt != 0); \
            \
            rowCnt++; \
            pTargetBaseCur += iTargetStride; \
            pSourceBaseCur += (iSourceStride * DIR); \
        } \
        while (rowCnt < destHeight); \
    } \
}
|
|
|
|
|
|
/*
 * 8-bit (c8bit) X / XY mirrored 1x1 copy for tiles whose width is >= 256.
 *
 * Gathers source bytes with 16-bit offsets (vldrbq_gather_offset_z_u16) and
 * stores them back narrowed to bytes (vstrbq_p_u16), 8 pixels per iteration,
 * so row indices up to 65535 can be addressed.
 *
 * Must be expanded inside __ARM_2D_PAVING_1x1: it relies on that template's
 * srcWidth and on the caller's pSource, pTarget, ptDstCopySize,
 * iSourceStride and iTargetStride. pSource and pTarget are advanced in place.
 *
 * SRC_OFFSET   element offset added once to pSource before the row loop
 * DIR          multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR, LOAD
 *              accepted for signature parity with the generic template; the
 *              offset setup and gather load are hard-coded here
 * ...          optional statement run between load and store (colour
 *              masking); it may read `in` and update the predicate `p`
 */
#define __ARM_2D_PAVING_1x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
                                                SETUP_MIRROR, LOAD, ...) \
{ \
    pSource += SRC_OFFSET; \
    \
    for (int_fast16_t y = 0; y < ptDstCopySize->iHeight; y++) { \
        uint8_t *pDst = (uint8_t *)pTarget; \
        const uint8_t *pSrc = (const uint8_t*)pSource; \
        uint32_t dstWidth = ptDstCopySize->iWidth; \
        /* 16-bit decrementing indices, starting at the last pixel */ \
        uint32_t curOffsetIdx = srcWidth - 1; \
        uint16x8_t offset = vddupq_wb_u16(&curOffsetIdx, 1); \
        \
        do { \
            mve_pred16_t p = vctp16q(dstWidth); \
            /* bytes are gathered into (and stored from) 16-bit lanes */ \
            uint16x8_t in; \
            \
            in = vldrbq_gather_offset_z_u16(pSrc, offset, p); \
            offset = vddupq_x_wb_u16(&curOffsetIdx, 1, p); \
            /* placeholder for color masking */ \
            __VA_ARGS__; \
            \
            vstrbq_p_u16(pDst, in, p); \
            \
            pDst += 8; \
            dstWidth -= 8; \
        } \
        /* dstWidth is unsigned: compare as signed to detect the wrap */ \
        while ((int32_t) dstWidth > 0); \
        \
        pSource += (iSourceStride * DIR); \
        pTarget += iTargetStride; \
    } \
}
|
|
|
|
|
|
/*
 * Helium code template: copy one (possibly mirrored) source tile region to
 * the target, row by row.
 *
 * Names taken from the expansion site: ptSrcCopySize, ptDstCopySize,
 * pSource, pTarget, iSourceStride and iTargetStride. pSource and pTarget are
 * advanced in place.
 *
 * sz            pixel size in bits; selects scalar/vector types and the
 *               vctp<sz>q tail predicate
 * SRC_OFFSET    element offset added once to pSource before the row loop
 * DIR           multiplier applied to iSourceStride when stepping rows
 * SETUP_MIRROR  macro invoked as SETUP_MIRROR(srcWidth) before each row copy
 * LOAD          load pattern, invoked as `in = LOAD(pSrc, offset);` and
 *               also pasted as IS_##LOAD to detect 8-bit X/XY mirroring
 * ...           optional statement run between load and store (colour
 *               masking); it may read `in` and update the predicate `p`
 */
#define __ARM_2D_PAVING_1x1(sz, SRC_OFFSET, DIR, SETUP_MIRROR, LOAD, ...) \
{ \
    uint32_t srcWidth = ptSrcCopySize->iWidth; \
    if ((sz == 8) && (srcWidth >= 256) && IS_##LOAD) { \
        /* special case for 8-bit and X & XY mirror */ \
        /* width does not fit in 8-bit, need widening */ \
        /* will be optimized away for all other cases */ \
        /* not executed unconditionally as slower */ \
        __ARM_2D_PAVING_1x1_8BIT_X_MIRROR_FIXUP(SRC_OFFSET, DIR, \
                                    SETUP_MIRROR, LOAD, __VA_ARGS__) \
    } else { \
        pSource += SRC_OFFSET; \
        \
        for (int_fast16_t y = 0; y < ptDstCopySize->iHeight; y++) { \
            ARM_PIX_SCLTYP(sz) *pDst = pTarget; \
            /* the source row is only read: keep it pointer-to-const */ \
            const ARM_PIX_SCLTYP(sz) *pSrc = pSource; \
            uint32_t dstWidth = ptDstCopySize->iWidth; \
            \
            SETUP_MIRROR(srcWidth); \
            \
            do { \
                /* tail predicate: only the remaining pixels are active */ \
                mve_pred16_t p = \
                    ARM_CONNECT2(ARM_CONNECT2(vctp, sz), q) (dstWidth); \
                ARM_PIX_VECTYP(sz) in; \
                \
                in = LOAD(pSrc, offset); \
                /* placeholder for color masking */ \
                __VA_ARGS__; \
                \
                vst1q_p(pDst, in, p); \
                \
                pDst += ARM_PIX_VECELT(sz); \
                dstWidth -= ARM_PIX_VECELT(sz); \
            } \
            /* dstWidth is unsigned: compare as signed to detect the wrap */ \
            while ((int32_t) dstWidth > 0); \
            \
            pSource += (iSourceStride * DIR); \
            pTarget += iTargetStride; \
        } \
    } \
}
|
|
|
|
/*============================ TYPES =========================================*/
|
|
/*============================ GLOBAL VARIABLES ==============================*/
|
|
/*============================ PROTOTYPES ====================================*/
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif
|