// SISMath.cpp: implementation of the CSISMath class.
|
//
|
//////////////////////////////////////////////////////////////////////
|
|
#include "stdafx.h"
|
#include "SISMath.h"
|
|
|
#include <emmintrin.h>
|
#include <tmmintrin.h>
|
#include <omp.h>
|
|
#ifdef _DEBUG
|
#define new DEBUG_NEW
|
#undef THIS_FILE
|
static char THIS_FILE[] = __FILE__;
|
#endif
|
|
//////////////////////////////////////////////////////////////////////
|
// Construction/Destruction
|
//////////////////////////////////////////////////////////////////////
|
|
#define SIS_SIMD
|
|
#define MOMP_MAX_THREAD_SIZE 4
|
|
//#define SIS_ASM
|
|
|
int64 CSISMath::OMP_CCU8_A(const uchar * vec1, int len)
|
{
|
int s = 0;
|
int64 sum = 0;
|
|
int64 sumT[MOMP_MAX_THREAD_SIZE] = {0};
|
#pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE)
|
{
|
int i= 0;
|
int templen= len- 16;
|
__m128i mmResult, mmResult2;
|
__m128i mmA;
|
__m128i mmAA, mmBB;
|
__m128i mmZeroData= _mm_setzero_si128();
|
|
int64 res64[2];
|
mmResult= _mm_setzero_si128();
|
#pragma omp for private(i)
|
for(i= 0; i <= templen; i+= 16)
|
{
|
mmA= _mm_load_si128((__m128i*) (vec1+ i));
|
|
mmAA= _mm_unpackhi_epi8(mmA, mmZeroData);
|
mmResult2= _mm_madd_epi16(mmAA, mmAA);
|
mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
|
mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
|
mmResult= _mm_add_epi64(mmAA, mmResult);
|
mmResult= _mm_add_epi64(mmBB, mmResult);
|
|
mmAA= _mm_unpacklo_epi8(mmA, mmZeroData);
|
mmResult2= _mm_madd_epi16(mmAA, mmAA);
|
mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
|
mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
|
mmResult= _mm_add_epi64(mmAA, mmResult);
|
mmResult= _mm_add_epi64(mmBB, mmResult);
|
}
|
_mm_storeu_si128((__m128i*)res64, mmResult);
|
sumT[omp_get_thread_num()]= res64[0]+ res64[1];
|
}
|
|
int i = 0;
|
for (i = 0; i < MOMP_MAX_THREAD_SIZE; i++)
|
{
|
sum += sumT[i];
|
}
|
|
i= len- (len&15);
|
for(; i < len; i++ )
|
{
|
s += vec1[i] * vec1[i];
|
}
|
|
return sum + s;
|
}
|
|
int64 CSISMath::OMP_CCU8_UA(const uchar *vec1, int len)
|
{
|
int s = 0;
|
int64 sum = 0;
|
|
int64 sumT[MOMP_MAX_THREAD_SIZE];
|
#pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE)
|
{
|
int i = 0;
|
int templen= len- 16;
|
int64 res64[2];
|
__m128i mmResult, mmResult2;
|
__m128i mmA, mmAA, mmBB;
|
__m128i mmZeroData= _mm_setzero_si128();
|
mmResult= mmZeroData;
|
#pragma omp for private(i)
|
for(i= 0; i <= templen; i+= 16)
|
{
|
mmA= _mm_loadu_si128((__m128i*) (vec1+ i));
|
|
mmAA= _mm_unpackhi_epi8(mmA, mmZeroData);
|
mmResult2= _mm_madd_epi16(mmAA, mmAA);
|
mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
|
mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
|
mmResult= _mm_add_epi64(mmAA, mmResult);
|
mmResult= _mm_add_epi64(mmBB, mmResult);
|
|
mmAA= _mm_unpacklo_epi8(mmA, mmZeroData);
|
mmResult2= _mm_madd_epi16(mmAA, mmAA);
|
mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
|
mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
|
mmResult= _mm_add_epi64(mmAA, mmResult);
|
mmResult= _mm_add_epi64(mmBB, mmResult);
|
}
|
_mm_storeu_si128((__m128i*)res64, mmResult);
|
sumT[omp_get_thread_num()]= res64[0]+ res64[1];
|
}
|
|
int i = 0;
|
for(i= 0; i< MOMP_MAX_THREAD_SIZE; i++)
|
{
|
sum+= sumT[i];
|
}
|
|
|
i= len- (len&15);
|
for(; i < len; i++ )
|
{
|
s += vec1[i] * vec1[i];
|
}
|
|
return sum+ s;
|
}
|
|
int64 CSISMath::OMP_CCU8_AUA( const uchar * vec1, const uchar * vec2, int len )
|
{
|
int s = 0;
|
int64 sum = 0;
|
int64 temp[MOMP_MAX_THREAD_SIZE];
|
|
|
|
#pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE)
|
{
|
int i = 0;
|
int templen= len- 16;
|
__m128i mmResult, mmResult2;
|
__m128i mmA, mmB;
|
__m128i mmAA, mmBB;
|
__m128i mmZeroData= _mm_setzero_si128();
|
mmResult= _mm_setzero_si128();
|
int64 res64[2];
|
|
#pragma omp for private(i)
|
for(i= 0; i <= templen; i+= 16)
|
{
|
mmA= _mm_load_si128((__m128i*) (vec1+ i));
|
mmB= _mm_loadu_si128((__m128i*) (vec2+ i));
|
|
mmAA= _mm_unpackhi_epi8(mmA, mmZeroData);
|
mmBB= _mm_unpackhi_epi8(mmB, mmZeroData);
|
mmResult2= _mm_madd_epi16(mmAA, mmBB);
|
mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
|
mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
|
mmResult= _mm_add_epi64(mmAA, mmResult);
|
mmResult= _mm_add_epi64(mmBB, mmResult);
|
|
mmAA= _mm_unpacklo_epi8(mmA, mmZeroData);
|
mmBB= _mm_unpacklo_epi8(mmB, mmZeroData);
|
mmResult2= _mm_madd_epi16(mmAA, mmBB);
|
mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
|
mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
|
mmResult= _mm_add_epi64(mmAA, mmResult);
|
mmResult= _mm_add_epi64(mmBB, mmResult);
|
}
|
_mm_storeu_si128((__m128i*)res64, mmResult);
|
temp[omp_get_thread_num()]= res64[0]+ res64[1];
|
}
|
int i = 0;
|
|
for(i= 0; i< MOMP_MAX_THREAD_SIZE; i++)
|
{
|
sum+= temp[i];
|
}
|
|
i= len- (len&15);
|
for(; i < len; i++ )
|
{
|
s += vec1[i] * vec1[i];
|
}
|
return sum+ s;
|
}
|