/*********************************************************************************************/
/* FILE NAME: FIR_Transparent_Intrinsic.c    COPYRIGHT (c) Freescale 2015   */
/*                                                      All Rights Reserved  						        */
/* DESCRIPTION: 10 Tap LSP Filter  using intrinsics								        */
/*                                                                                             				        */
/*********************************************************************************************/	
/* REV      AUTHOR        DATE        DESCRIPTION OF CHANGE 			         */
/* ---   -----------    ----------    ---------------------                                     		             */
/* 1.0	  A Turner   			April 2015   Initial Public Release        		             */
/**********************************************************************************************/

#include "dsp_filter.h"
#include <lsp.h>
#include "../headers/typedefs.h"



/*----------------------------------------------------------------------------------------------*/
/* FIR input, output vectors and coefficients */
     // N - number of output samples
     // ntaps - number of filter coefficients
     // x - input array of size N+ntaps-1
     // y - output array of size N
     // hr - array of coefficients of length ntaps,
     // coefficients are stored in reversed order
 
//fir_frac16_c(uint16_t N, uint16_t ntaps,uint16_t *x,uint16_t *y, int16_t *h)
#pragma ghs section text=".vle_imem"

/*
 * fir_frac16_LSP__trans_intrinsic - 10-tap FIR filter built from LSP intrinsics.
 *
 * Parameters:
 *   N      Number of output samples requested. The loop advances two samples
 *          per pass and stores one result pair per pass, so an odd N still
 *          runs a final pass that also stores y_ptr[N].
 *   x_ptr  Input samples. Each pass reads one pair ahead of the ten-sample
 *          window: pass n loads x_ptr[n+10] and x_ptr[n+11]. For even N the
 *          highest element read is therefore x_ptr[N+9].
 *   y_ptr  Output buffer; advanced by two elements after every store.
 *   h_ptr  Ten coefficients, h_ptr[0] .. h_ptr[9].
 *
 * Pairing used by the code: the sample pair (x[n], x[n+1]) is multiplied with
 * h_ptr[9], the pair (x[n+1], x[n+2]) with h_ptr[8], and so on down to
 * h_ptr[0], i.e. the coefficient array is walked in reverse relative to the
 * sample window.
 *
 * NOTE(review): lane ordering, saturation and rounding of the __z* intrinsics
 * are defined by <lsp.h> / the LSP instruction set and are not visible in this
 * file - confirm against the LSP reference before relying on the comments
 * below that describe what each intrinsic presumably does.
 *
 * Build option: define Full_Intrinsics to load data with the direct LSP load
 * intrinsics; otherwise operands are assembled with __lsp_create_32_16().
 */
void fir_frac16_LSP__trans_intrinsic(uint16_t N, int16_t *x_ptr, int16_t *y_ptr, int16_t *h_ptr)
{
    /* Six packed sample pairs: five form the current window, the sixth is the read-ahead pair. */
    __lsp32_sf16__ InputVector0, InputVector1, InputVector2, InputVector3, InputVector4, InputVector5;

    /* One packed register per tap; CoeffK is built from h_ptr[K-1]. */
    __lsp32_sf16__ Coeff1, Coeff2, Coeff3, Coeff4, Coeff5, Coeff6, Coeff7, Coeff8, Coeff9, Coeff10;

    __lsp64_32__ Accumulating_Product;
    int n;

    /* ---------------- STAGE 1 : load the coefficients ---------------- */
#ifdef Full_Intrinsics
    /* Option A: direct LSP load intrinsics. Offsets step by 2, i.e. one
     * int16_t per coefficient, matching h_ptr[0..9] in Option B. */
    Coeff1  = __zlhhsplat(h_ptr, 0);
    Coeff2  = __zlhhsplatu(h_ptr, 2);
    Coeff3  = __zlhhsplatu(h_ptr, 4);
    Coeff4  = __zlhhsplatu(h_ptr, 6);
    Coeff5  = __zlhhsplatu(h_ptr, 8);
    Coeff6  = __zlhhsplatu(h_ptr, 10);
    Coeff7  = __zlhhsplatu(h_ptr, 12);
    Coeff8  = __zlhhsplatu(h_ptr, 14);
    Coeff9  = __zlhhsplatu(h_ptr, 16);
    Coeff10 = __zlhhsplatu(h_ptr, 18);
#else
    /* Option B: build each operand from the same coefficient in both halves. */
    Coeff1  = __lsp_create_32_16(h_ptr[0], h_ptr[0]);
    Coeff2  = __lsp_create_32_16(h_ptr[1], h_ptr[1]);
    Coeff3  = __lsp_create_32_16(h_ptr[2], h_ptr[2]);
    Coeff4  = __lsp_create_32_16(h_ptr[3], h_ptr[3]);
    Coeff5  = __lsp_create_32_16(h_ptr[4], h_ptr[4]);
    Coeff6  = __lsp_create_32_16(h_ptr[5], h_ptr[5]);
    Coeff7  = __lsp_create_32_16(h_ptr[6], h_ptr[6]);
    Coeff8  = __lsp_create_32_16(h_ptr[7], h_ptr[7]);
    Coeff9  = __lsp_create_32_16(h_ptr[8], h_ptr[8]);
    Coeff10 = __lsp_create_32_16(h_ptr[9], h_ptr[9]);
#endif

    /* ---------------- Preload the first ten input samples ---------------- */
#ifdef Full_Intrinsics
    /* Option A: offsets step by 4, i.e. one pair of int16_t per load. */
    InputVector0 = __zlww(x_ptr, 0);
    InputVector1 = __zlwwu(x_ptr, 4);
    InputVector2 = __zlwwu(x_ptr, 8);
    InputVector3 = __zlwwu(x_ptr, 12);
    InputVector4 = __zlwwu(x_ptr, 16);
#else
    /* Option B: pairs (x0,x1) (x2,x3) (x4,x5) (x6,x7) (x8,x9). */
    InputVector0 = __lsp_create_32_16(x_ptr[0], x_ptr[1]);
    InputVector1 = __lsp_create_32_16(x_ptr[2], x_ptr[3]);
    InputVector2 = __lsp_create_32_16(x_ptr[4], x_ptr[5]);
    InputVector3 = __lsp_create_32_16(x_ptr[6], x_ptr[7]);
    InputVector4 = __lsp_create_32_16(x_ptr[8], x_ptr[9]);
#endif

    for (n = 0; n < N; n += 2)
    {
        /* Fetch the read-ahead pair (x[n+10], x[n+11]). */
#ifdef Full_Intrinsics
        /* Offset 20 addresses x_ptr[10]/x_ptr[11] relative to the current
         * x_ptr, the same samples the Option B load below picks up.
         * Offset 16 would re-read the pair already held in InputVector4. */
        InputVector5 = __zlwwu(x_ptr, 20);
        x_ptr += 2;   /* slide the base by one pair for the next pass */
#else
        InputVector5 = __lsp_create_32_16(x_ptr[n + 10], x_ptr[n + 11]);
#endif

        /* ------------ STAGE 2 : MAC with Coeff10/8/6/4/2 ------------ */
        /* First call starts the accumulator; the "aas" calls presumably
         * add to it (TODO confirm saturation behaviour in the LSP manual). */
        Accumulating_Product = __zvmhulsf(InputVector0, Coeff10);
        Accumulating_Product = __zvmhulsfaas(Accumulating_Product, InputVector1, Coeff8);
        Accumulating_Product = __zvmhulsfaas(Accumulating_Product, InputVector2, Coeff6);
        Accumulating_Product = __zvmhulsfaas(Accumulating_Product, InputVector3, Coeff4);
        Accumulating_Product = __zvmhulsfaas(Accumulating_Product, InputVector4, Coeff2);

        /* ------------ STAGE 3 : shift the window by one sample ------------ */
        /* Each vector is merged with its successor so that, presumably, it
         * then holds the pair delayed by one sample. Order matters: every
         * merge consumes the not-yet-overwritten successor. */
        InputVector0 = __zvmergelohih(InputVector0, InputVector1);
        InputVector1 = __zvmergelohih(InputVector1, InputVector2);
        InputVector2 = __zvmergelohih(InputVector2, InputVector3);
        InputVector3 = __zvmergelohih(InputVector3, InputVector4);
        InputVector4 = __zvmergelohih(InputVector4, InputVector5);

        /* ------------ STAGE 4 : MAC with Coeff9/7/5/3/1 ------------ */
        Accumulating_Product = __zvmhulsfaas(Accumulating_Product, InputVector0, Coeff9);
        Accumulating_Product = __zvmhulsfaas(Accumulating_Product, InputVector1, Coeff7);
        Accumulating_Product = __zvmhulsfaas(Accumulating_Product, InputVector2, Coeff5);
        Accumulating_Product = __zvmhulsfaas(Accumulating_Product, InputVector3, Coeff3);
        Accumulating_Product = __zvmhulsfaas(Accumulating_Product, InputVector4, Coeff1);

        /* ------------ STAGE 5 : store the result pair ------------ */
        __zstwhed(Accumulating_Product, y_ptr, 0);
        y_ptr = y_ptr + 2;

        /* ------------ STAGE 6 : shift the window by a second sample ------------ */
        /* After this the five window vectors line up with sample n+2, ready
         * for the next pass; InputVector4 is refilled from the read-ahead pair. */
        InputVector0 = __zvmergelohih(InputVector0, InputVector1);
        InputVector1 = __zvmergelohih(InputVector1, InputVector2);
        InputVector2 = __zvmergelohih(InputVector2, InputVector3);
        InputVector3 = __zvmergelohih(InputVector3, InputVector4);
        InputVector4 = __zvmergehiloh(InputVector5, InputVector5);
    }
}
#pragma ghs section vletext=default
