// MatchImpl.cpp: implementation of the CMatchImpl class.
|
//
|
//////////////////////////////////////////////////////////////////////
|
|
#include "stdafx.h"
|
#include "MatchImpl.h"
|
#include "SISMath.h"
|
|
#ifdef _DEBUG
|
#define new DEBUG_NEW
|
#undef THIS_FILE
|
static char THIS_FILE[] = __FILE__;
|
#endif
|
|
//////////////////////////////////////////////////////////////////////
|
// Construction/Destruction
|
//////////////////////////////////////////////////////////////////////
|
#include <emmintrin.h>
|
//#include <tmmintrin.h>
|
//#include <ia32intrin.h>
|
#include <math.h>
|
|
//#define MOSIS_ASM
|
|
extern "C"{
|
int64 _asm_GetCC_8u(const uchar * vec1, const uchar * vec2, int len, int flag);// 0= aa, 1=au, 2=uu
|
};
|
|
|
|
/**
 * Returns the sum of element-wise products of two unsigned 8-bit vectors:
 * vec1[0]*vec2[0] + ... + vec1[len-1]*vec2[len-1].
 *
 * Build-time variants:
 *  - MOSIS_ASM: forwards to the external routine _asm_GetCC_8u with flag 0.
 *  - MOSIS_DEBUG_ALIGNED_UNALIGNED: forwards to Get_CCU8_UA.
 *  - MOSIS_SIMD: SSE2 path, 16 elements per step, using aligned loads
 *    (_mm_load_si128), so vec1 and vec2 must be 16-byte aligned.
 *  - otherwise: scalar loop unrolled by four.
 *
 * @param vec1 first input vector (at least len elements)
 * @param vec2 second input vector (at least len elements)
 * @param len  number of elements; in the SIMD/scalar paths a value <= 0
 *             yields 0
 * @return the sum of products as a 64-bit integer
 */
int64 CSISMath::Get_CCU8_A(const uchar * vec1, const uchar * vec2, int len )
{
#if defined(MOSIS_ASM)
    return _asm_GetCC_8u(vec1, vec2, len, 0);
#elif defined(MOSIS_DEBUG_ALIGNED_UNALIGNED)
    return Get_CCU8_UA(vec1, vec2, len);
#else
    int i = 0;
    int64 sum = 0;

#ifdef MOSIS_SIMD
    const __m128i zero = _mm_setzero_si128();
    // Two 64-bit lanes. Accumulating in 32-bit lanes (and summing the lanes
    // in 32-bit arithmetic) wraps once the total exceeds 2^32, which happens
    // for inputs from roughly 66K bright elements upward.
    __m128i acc64 = _mm_setzero_si128();

    for (i = 0; i <= len - 16; i += 16)
    {
        const __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(vec1 + i));
        const __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(vec2 + i));

        // Upper half: widen the upper eight 8-bit values to 16 bits, then
        // multiply and add adjacent pairs into four 32-bit partial sums.
        const __m128i prodHi = _mm_madd_epi16(_mm_unpackhi_epi8(a, zero),
                                              _mm_unpackhi_epi8(b, zero));

        // Lower half: same treatment for the lower eight values.
        const __m128i prodLo = _mm_madd_epi16(_mm_unpacklo_epi8(a, zero),
                                              _mm_unpacklo_epi8(b, zero));

        // Each 32-bit lane holds at most 4 * 255 * 255 = 260100 here, so this
        // per-step addition cannot overflow.
        const __m128i step32 = _mm_add_epi32(prodHi, prodLo);

        // The partial sums are non-negative, so interleaving with zero
        // zero-extends them to 64 bits before they join the accumulator.
        acc64 = _mm_add_epi64(acc64, _mm_unpacklo_epi32(step32, zero));
        acc64 = _mm_add_epi64(acc64, _mm_unpackhi_epi32(step32, zero));
    }

    int64 lanes[2];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes), acc64);
    sum += lanes[0] + lanes[1];
#else
    for (i = 0; i <= len - 4; i += 4)
    {
        // Four products fit comfortably in an int (max 260100).
        const int partial = vec1[i]     * vec2[i]
                          + vec1[i + 1] * vec2[i + 1]
                          + vec1[i + 2] * vec2[i + 2]
                          + vec1[i + 3] * vec2[i + 3];
        sum += partial;
    }
#endif

    // Remaining elements that did not fill a whole SIMD/unrolled step.
    for ( ; i < len; i++)
    {
        sum += vec1[i] * vec2[i];
    }

    return sum;
#endif
}
|