/*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      Arm-2D Library
 * Title:        arm-2d_math_helium.h
 * Description:  Provides helium math routines
 *
 * $Date:        29. Sep 2021
 * $Revision:    V 0.0.2
 *
 * Target Processor:  Cortex-M cores with Helium
 *
 * -------------------------------------------------------------------- */


#ifndef __ARM_2D_MATH_HELIUM_H__
#define __ARM_2D_MATH_HELIUM_H__


#if defined(__ARM_2D_HAS_HELIUM__) && __ARM_2D_HAS_HELIUM__

/*============================ INCLUDES ======================================*/
#include "arm_2d.h"
#include <arm_math.h>

#if __ARM_2D_HAS_HELIUM_FLOAT__
#include <arm_math_f16.h>
#endif

#ifdef   __cplusplus
extern "C" {
#endif

#define vec_rgb8_t             uint8x16_t
#define vec_rgb16_t            uint16x8_t
#define vec_rgb32_t            uint32x4_t

#define ARM_PIX_SCLTYP(sz)     ARM_CONNECT2(ARM_CONNECT2(uint, sz), _t)
#define ARM_PIX_VECTYP(sz)     ARM_CONNECT2(ARM_CONNECT2(vec_rgb,sz), _t)

#define vld8                 vldrb
#define vld16                vldrh
#define vld32                vldrw
#define vst8                 vstrb
#define vst16                vstrh
#define vst32                vstrw

#define ARM_VLD_ASM(sz)       ARM_CONNECT2(vld, sz)
#define ARM_VST_ASM(sz)       ARM_CONNECT2(vst, sz)

#define ARM_VLD1_ASM(sz)      TO_STRING(ARM_VLD_ASM(sz).ARM_CONNECT2(u,sz))
#define ARM_VST1_ASM(sz)      TO_STRING(ARM_VST_ASM(sz).ARM_CONNECT2(u,sz))
#define ARM_VLD1Z_ASM(sz)     TO_STRING(ARM_VLD_ASM(sz)t.ARM_CONNECT2(u,sz))
#define ARM_VST1P_ASM(sz)     TO_STRING(ARM_VST_ASM(sz)t.ARM_CONNECT2(u,sz))
#define ARM_VLDWID_ASM(sz, wid)    TO_STRING(ARM_VLD_ASM(sz).ARM_CONNECT2(u,wid))
#define ARM_VSTNRW_ASM(sz, nrw)    TO_STRING(ARM_VLD_ASM(sz).ARM_CONNECT2(u,nrw))


/* number of vector elements */
#define ARM_PIX_VECELT(sz)     (128/sz)

/*============================ MACROS ========================================*/
/*============================ MACROFIED FUNCTIONS ===========================*/


#if __ARM_2D_HAS_HELIUM_FLOAT__

#define __PIF16                3.14159265358979F16
#define __CALIBF16             0.009f16
/* avoid 0 division */
/* should not be too small as inverse could grows to F16 infinite 65e3 when */
/* multiplied with numerator */
#define __EPSF16               1e-03f16

#define __LARGEINVF32          100.0f

#define __PI_2_F16             (__PIF16/2.0f16)
#define INV_NEWTON_INIT_F16  0x7773

#define INFINITY_F16           ((float16_t)__builtin_inf())


#define ATANF_LUT                   \
	-0.0443265554792128,	/*p7*/  \
	-0.3258083974640975,	/*p3*/  \
	+0.1555786518463281,	/*p5*/  \
	+0.9997878412794807  	/*p1*/  \

extern const float16_t sinTable_f16[FAST_MATH_TABLE_SIZE + 1];
extern const float16_t atanf_lut_f16[4];

#define INVSQRT_MAGIC_F16           0x59ba      /*  ( 0x1ba = 0x3759df >> 13) */

/* 2D point floating point vector types */
typedef struct arm_2d_point_f32x4_t {
    float32x4_t X;
    float32x4_t Y;
} arm_2d_point_f32x4_t;

typedef struct arm_2d_point_f16x8_t {
    float16x8_t     X;
    float16x8_t     Y;
} arm_2d_point_f16x8_t;

#endif

/* 2D point integer point vector types */
typedef struct arm_2d_point_s16x8_t {
    int16x8_t       X;
    int16x8_t       Y;
} arm_2d_point_s16x8_t;

typedef struct arm_2d_point_s32x4_t {
    int32x4_t       X;
    int32x4_t       Y;
} arm_2d_point_s32x4_t;


#if __ARM_2D_HAS_HELIUM_FLOAT__ == 1

__STATIC_FORCEINLINE  float16x8_t vsin_f16(
    float16x8_t vecIn)
{
    q15x8_t         vecMask = vdupq_n_s16(0x1ff);
    float16x8_t     vecOne = vdupq_n_f16(1.0f16);
    float16x8_t     vecTmpFlt0, vecTmpFlt1;
    float16x8_t     vecSin0, vecSin1;
    q15x8_t         vecTmpFx;


    /*
     * input x is in radians
     * Scale the input to [0 1] range from [0 2*PI]
     * divide input by 2*pi (add 0.25 (pi/2) to read sine table)
     * in = x * 0.159154943092f + 0.25f
     */
    vecTmpFlt0 = vmulq(vecIn, (float16_t) (1.0 / (2.0 * PI)));
    /*
     * Calculation of floor value of input
     */
    vecTmpFx = vcvtmq_s16_f16(vecTmpFlt0);
    vecTmpFlt1 = vcvtq_f16_s16(vecTmpFx);
    /*
     * Map input value to [0 1]
     */
    vecTmpFlt1 = vecTmpFlt0 - vecTmpFlt1;
    /*
     * Calculation of index of the table
     * findex = (float16_t) FAST_MATH_TABLE_SIZE * in;
     */
    vecTmpFlt0 = vecTmpFlt1 * (float16_t) FAST_MATH_TABLE_SIZE;
    /*
     * index = ((uint16_t)findex) & 0x1ff;
     */
    vecTmpFx = vcvtq_s16_f16(vecTmpFlt0);
    vecTmpFx = vandq(vecTmpFx, vecMask);
    /*
     * fractional value calculation
     * fract = findex - (float16_t) index;
     */
    vecTmpFlt1 = vcvtq_f16_s16(vecTmpFx);
    vecTmpFlt0 = vecTmpFlt0 - vecTmpFlt1;
    /*
     * Read two nearest values of input value from the cos table
     * a = sinTable_f16[index];
     */
    vecSin0 = vldrhq_gather_shifted_offset_f16(sinTable_f16, vecTmpFx);
    /*
     * b = sinTable_f16[index+1];
     */
    vecTmpFx = vecTmpFx + 1;
    vecSin1 = vldrhq_gather_shifted_offset_f16(sinTable_f16, vecTmpFx);
    /*
     * 1.0f - fract
     */
    vecTmpFlt1 = vecOne - vecTmpFlt0;
    /*
     * fract * b;
     */
    vecTmpFlt0 = vecTmpFlt0 * vecSin1;
    /*
     * Linear interpolation process
     * cosVal = (1.0f-fract)*a + fract*b;
     */
    vecTmpFlt0 = vfmaq(vecTmpFlt0, vecSin0, vecTmpFlt1);

    // set 'tiny' values to x (F16 precision limit)
    vecTmpFlt0 = vpselq(vecIn, vecTmpFlt0, vcmpleq(vabsq(vecIn), 0.05f16));

    return vecTmpFlt0;
}


__STATIC_FORCEINLINE float16x8_t vcos_f16(
    float16x8_t x)
{
    return vsin_f16(x + __PI_2_F16);
}


/* fast inverse approximation (4x newton) */
__STATIC_INLINE float16x8_t vrecip_hiprec_f16(
    float16x8_t x)
{
    q15x8_t         m;
    float16x8_t         b;
    any16x8_t       xinv;
    float16x8_t         ax = vabsq(x);

    xinv.f = ax;

    m = 0x03c00 - (xinv.i & 0x07c00);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f;
    xinv.i = xinv.i + m;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    xinv.f = vdupq_m(xinv.f, INFINITY_F16, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}

__STATIC_FORCEINLINE float16x8_t vrecip_lowprec_f16(float16x8_t vecIn)
{
    float16x8_t       vecSx, vecW;
    any16x8_t     r;

    vecSx = vabsq(vecIn);
    r.f = vecSx;

    r.i = vdupq_n_u16(INV_NEWTON_INIT_F16) - r.i;

    vecW = vmulq(vecSx, r.f);
    vecW = vsubq(vdupq_n_f16((float16_t)2), vecW);

    r.f = vmulq(r.f, vecW);

    r.f = vdupq_m(r.f, (float16_t)INFINITY, vcmpeqq(vecIn, (float16_t)0));
    /*
    * restore sign
    */
    r.f = vnegq_m(r.f, r.f, vcmpltq_n_f16(vecIn, (float16_t)0));

    return r.f;
}


__STATIC_FORCEINLINE float16x8_t vatan_f16(
    float16x8_t x)
{
    float16x8_t         a, b, r;
    float16x8_t         xx;
    any16x8_t           xinv , ax;

    ax.f = vabsq(x);
    //fast inverse approximation (2x newton)
    xinv.f = vrecip_lowprec_f16( ax.f );

    //if |x| > 1.0 -> ax = -1/ax, r = pi/2
    xinv.f = xinv.f + ax.f;

    // a = (c > 1.0f);
    a = vdupq_n_f16(0.0f16);
    a = vdupq_m(a, 1.0f16, vcmpgtq(ax.f, 1.0f16));

    ax.f = ax.f - a * xinv.f;
    r = a * (float16_t)__PI_2_F16;

    //polynomial evaluation
    xx = ax.f * ax.f;
    a = (atanf_lut_f16[0] * ax.f) * xx + (atanf_lut_f16[2] * ax.f);
    b = (atanf_lut_f16[1] * ax.f) * xx + (atanf_lut_f16[3] * ax.f);
    xx = xx * xx;
    b = b + a * xx;
    r = r + b;

    //if x < 0 -> r = -r
    r = vnegq_m(r, r, vcmpltq(x, 0.0f16));
    // set 'tiny' values to x (F16 precision limit)
    r = vpselq(x, r, vcmpleq(vabsq(x), 0.05f16));

    return r;
}


__STATIC_INLINE float16x8_t vdiv_f16(
    float16x8_t num, float16x8_t den)
{
    return vmulq(num, vrecip_hiprec_f16(den));
}


__STATIC_FORCEINLINE float16x8_t visqrtf_f16(
    float16x8_t vecIn)
{
    int16x8_t       vecNewtonInit = vdupq_n_s16(INVSQRT_MAGIC_F16);
    float16x8_t     vecOneHandHalf = vdupq_n_f16(1.5f16);
    float16x8_t     vecHalf;
    int16x8_t       vecTmpInt;
    float16x8_t     vecTmpFlt, vecTmpFlt1;


    vecHalf = vmulq(vecIn, (float16_t) 0.5f16);
    /*
     * cast input float vector to integer and right shift by 1
     */
    vecTmpInt = vshrq((int16x8_t) vecIn, 1);
    /*
     * INVSQRT_MAGIC - ((vec_q16_t)vecIn >> 1)
     */
    vecTmpInt = vsubq(vecNewtonInit, vecTmpInt);

    /*
     *------------------------------
     * 1st iteration
     *------------------------------
     * (1.5f-xhalf*x*x)
     */
    vecTmpFlt1 = vmulq((float16x8_t) vecTmpInt, (float16x8_t) vecTmpInt);
    vecTmpFlt1 = vmulq(vecTmpFlt1, vecHalf);
    vecTmpFlt1 = vsubq(vecOneHandHalf, vecTmpFlt1);
    /*
     * x = x*(1.5f-xhalf*x*x);
     */
    vecTmpFlt = vmulq((float16x8_t) vecTmpInt, vecTmpFlt1);

    /*
     *------------------------------
     * 2nd iteration
     *------------------------------
     */
    vecTmpFlt1 = vmulq(vecTmpFlt, vecTmpFlt);
    vecTmpFlt1 = vmulq(vecTmpFlt1, vecHalf);
    vecTmpFlt1 = vsubq(vecOneHandHalf, vecTmpFlt1);
    vecTmpFlt = vmulq(vecTmpFlt, vecTmpFlt1);

    /*
     * set negative values to nan
     */
    vecTmpFlt = vdupq_m(vecTmpFlt, (float16_t) NAN, vcmpltq(vecIn, (float16_t) 0.0f16));
    vecTmpFlt =
        vdupq_m(vecTmpFlt, (float16_t) INFINITY, vcmpeqq(vecIn, (float16_t) 0.0f16));

    return vecTmpFlt;
}


__STATIC_FORCEINLINE float16x8_t vsqrtf_f16(
    float16x8_t vecIn)
{
    float16x8_t         vecDst;

    /* inverse square root unsing 2 newton iterations */
    vecDst = visqrtf_f16(vecIn);
    vecDst = vdupq_m(vecDst, 0.0f, vcmpeqq(vecIn, 0.0f));
    vecDst = vecDst * vecIn;
    return vecDst;
}

#endif

/*============================ TYPES =========================================*/
/*============================ GLOBAL VARIABLES ==============================*/
/*============================ PROTOTYPES ====================================*/

#ifdef   __cplusplus
}
#endif

#endif // (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)

#endif // __ARM_2D_MATH_HELIUM_H__