// SISMath.cpp: implementation of the CSISMath class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "SISMath.h"


#include <emmintrin.h>
#include <tmmintrin.h>
#include <omp.h>

#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// NOTE(review): SIS_SIMD is not referenced in the visible part of this file;
// presumably it selects SIMD code paths elsewhere - confirm before removing.
#define SIS_SIMD

// Thread count requested for every OpenMP parallel region in this file, and
// the size of the per-thread partial-sum arrays indexed by omp_get_thread_num().
#define MOMP_MAX_THREAD_SIZE 4

//#define SIS_ASM


// Sum of squares of the first `len` bytes of vec1, returned as a 64-bit value.
//
// vec1 : input bytes; read with _mm_load_si128, so every 16-byte block start
//        (vec1 + 16*k) must be 16-byte aligned.
// len  : number of bytes to process.
//
// Full 16-byte blocks are split across an OpenMP team of up to
// MOMP_MAX_THREAD_SIZE threads; the remaining (len & 15) bytes are handled
// by a scalar loop on the calling thread.
int64 CSISMath::OMP_CCU8_A(const uchar * vec1, int len)
{
	// Scalar part: the trailing bytes that do not fill a whole 16-byte block.
	int tailSum = 0;
	for (int k = len - (len & 15); k < len; k++)
	{
		tailSum += vec1[k] * vec1[k];
	}

	// One slot per possible team member; zero-filled so that slots of threads
	// that never ran contribute nothing.
	int64 partial[MOMP_MAX_THREAD_SIZE] = {0};

#pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE)
	{
		const __m128i zero = _mm_setzero_si128();
		const int lastBlock = len - 16;	// offset of the last complete block
		__m128i acc = zero;				// two 64-bit running sums per thread
		int off;

#pragma omp for
		for (off = 0; off <= lastBlock; off += 16)
		{
			const __m128i bytes = _mm_load_si128((const __m128i*) (vec1 + off));

			// Upper 8 bytes: widen to 16 bit, square and pair-add to 32 bit,
			// then widen to 64 bit before accumulating.
			const __m128i hi16 = _mm_unpackhi_epi8(bytes, zero);
			const __m128i hiSq = _mm_madd_epi16(hi16, hi16);
			acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(hiSq, zero));
			acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(hiSq, zero));

			// Lower 8 bytes: same sequence.
			const __m128i lo16 = _mm_unpacklo_epi8(bytes, zero);
			const __m128i loSq = _mm_madd_epi16(lo16, lo16);
			acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(loSq, zero));
			acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(loSq, zero));
		}

		// Fold the two 64-bit lanes into this thread's slot.
		int64 lanes[2];
		_mm_storeu_si128((__m128i*)lanes, acc);
		partial[omp_get_thread_num()] = lanes[0] + lanes[1];
	}

	// Combine the per-thread results.
	int64 total = 0;
	for (int t = 0; t < MOMP_MAX_THREAD_SIZE; t++)
	{
		total += partial[t];
	}

	return total + tailSum;
}

// Sum of squares of the first `len` bytes of vec1, returned as a 64-bit value.
//
// vec1 : input bytes; read with _mm_loadu_si128, so no alignment is required.
// len  : number of bytes to process.
//
// Full 16-byte blocks are split across an OpenMP team of up to
// MOMP_MAX_THREAD_SIZE threads; the remaining (len & 15) bytes are handled
// by a scalar loop on the calling thread.
int64	CSISMath::OMP_CCU8_UA(const uchar *vec1, int len)
{
	int s = 0;
	int64 sum = 0;

	// Must be zero-filled: the runtime may start fewer threads than requested,
	// and every slot is added into the result below.
	int64 sumT[MOMP_MAX_THREAD_SIZE] = {0};
#pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE)
	{
		int i = 0;
		int templen= len- 16;	// offset of the last complete 16-byte block
		int64	res64[2];
		__m128i	mmResult, mmResult2;
		__m128i mmA, mmAA, mmBB;
		__m128i mmZeroData= _mm_setzero_si128();
		mmResult= mmZeroData;	// two 64-bit running sums per thread
#pragma omp for private(i)
		for(i= 0; i <= templen; i+= 16)
		{
			mmA= _mm_loadu_si128((__m128i*) (vec1+ i));

			// Upper 8 bytes: widen to 16 bit, square and pair-add to 32 bit,
			// then widen to 64 bit before accumulating.
			mmAA= _mm_unpackhi_epi8(mmA, mmZeroData);
			mmResult2= _mm_madd_epi16(mmAA, mmAA);
			mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
			mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
			mmResult= _mm_add_epi64(mmAA, mmResult);
			mmResult= _mm_add_epi64(mmBB, mmResult);

			// Lower 8 bytes: same sequence.
			mmAA= _mm_unpacklo_epi8(mmA, mmZeroData);
			mmResult2= _mm_madd_epi16(mmAA, mmAA);
			mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
			mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
			mmResult= _mm_add_epi64(mmAA, mmResult);
			mmResult= _mm_add_epi64(mmBB, mmResult);
		}
		// Fold the two 64-bit lanes into this thread's slot.
		_mm_storeu_si128((__m128i*)res64, mmResult);
		sumT[omp_get_thread_num()]= res64[0]+ res64[1];
	}

	// Combine the per-thread results.
	int i = 0;
	for(i= 0; i< MOMP_MAX_THREAD_SIZE; i++)
	{
		sum+= sumT[i];
	}

	// Scalar tail: bytes past the last complete 16-byte block.
	i= len- (len&15);
	for(; i < len; i++ )
	{
		s += vec1[i] * vec1[i];
	}

	return sum+ s;
}

// Sum of element-wise products vec1[i] * vec2[i] over the first `len` bytes,
// returned as a 64-bit value.
//
// vec1 : first input; read with _mm_load_si128, so every 16-byte block start
//        (vec1 + 16*k) must be 16-byte aligned.
// vec2 : second input; read with _mm_loadu_si128, no alignment required.
// len  : number of bytes to process from each input.
//
// Full 16-byte blocks are split across an OpenMP team of up to
// MOMP_MAX_THREAD_SIZE threads; the remaining (len & 15) bytes are handled
// by a scalar loop on the calling thread.
int64	CSISMath::OMP_CCU8_AUA( const uchar * vec1, const uchar * vec2, int len )
{
	int s = 0;
	int64 sum = 0;
	// Must be zero-filled: the runtime may start fewer threads than requested,
	// and every slot is added into the result below.
	int64 temp[MOMP_MAX_THREAD_SIZE] = {0};

#pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE)
	{
		int i = 0;
		int	templen= len- 16;	// offset of the last complete 16-byte block
		__m128i	mmResult, mmResult2;
		__m128i	mmA, mmB;
		__m128i	mmAA, mmBB;
		__m128i mmZeroData= _mm_setzero_si128();
		mmResult= _mm_setzero_si128();	// two 64-bit running sums per thread
		int64 res64[2];

#pragma omp for private(i)
		for(i= 0; i <= templen; i+= 16)
		{
			mmA= _mm_load_si128((__m128i*) (vec1+ i));
			mmB= _mm_loadu_si128((__m128i*) (vec2+ i));

			// Upper 8 bytes of each input: widen to 16 bit, multiply and
			// pair-add to 32 bit, then widen to 64 bit before accumulating.
			mmAA= _mm_unpackhi_epi8(mmA, mmZeroData);
			mmBB= _mm_unpackhi_epi8(mmB, mmZeroData);
			mmResult2= _mm_madd_epi16(mmAA, mmBB);
			mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
			mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
			mmResult= _mm_add_epi64(mmAA, mmResult);
			mmResult= _mm_add_epi64(mmBB, mmResult);

			// Lower 8 bytes: same sequence.
			mmAA= _mm_unpacklo_epi8(mmA, mmZeroData);
			mmBB= _mm_unpacklo_epi8(mmB, mmZeroData);
			mmResult2= _mm_madd_epi16(mmAA, mmBB);
			mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
			mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
			mmResult= _mm_add_epi64(mmAA, mmResult);
			mmResult= _mm_add_epi64(mmBB, mmResult);
		}
		// Fold the two 64-bit lanes into this thread's slot.
		_mm_storeu_si128((__m128i*)res64, mmResult);
		temp[omp_get_thread_num()]= res64[0]+ res64[1];
	}
	int i = 0;

	// Combine the per-thread results.
	for(i= 0; i< MOMP_MAX_THREAD_SIZE; i++)
	{
		sum+= temp[i];
	}

	// Scalar tail: bytes past the last complete 16-byte block. Uses the same
	// vec1 * vec2 product as the SIMD body.
	i= len- (len&15);
	for(; i < len; i++ )
	{
		s += vec1[i] * vec2[i];
	}
	return sum+ s;
}