// MatchImpl.cpp: implementation of the CMatchImpl class. // ////////////////////////////////////////////////////////////////////// #include "stdafx.h" #include "MatchImpl.h" #include "SISMath.h" #ifdef _DEBUG #define new DEBUG_NEW #undef THIS_FILE static char THIS_FILE[] = __FILE__; #endif ////////////////////////////////////////////////////////////////////// // Construction/Destruction ////////////////////////////////////////////////////////////////////// #include //#include //#include #include //#define MOSIS_ASM extern "C"{ int64 _asm_GetCC_8u(const uchar * vec1, const uchar * vec2, int len, int flag);// 0= aa, 1=au, 2=uu }; int64 CSISMath::Get_CCU8_A(const uchar * vec1, const uchar * vec2, int len ) { #if defined(MOSIS_ASM) return _asm_GetCC_8u(vec1, vec2, len, 0); #else #if defined(MOSIS_DEBUG_ALIGNED_UNALIGNED) return Get_CCU8_UA(vec1, vec2, len); #endif int i= 0, s = 0; int64 sum = 0; #ifdef MOSIS_SIMD __m128i mmResult, mmResult2; __m128i mmA, mmB; __m128i mmAA, mmBB; __m128i mmZeroData= _mm_setzero_si128(); mmResult= _mm_setzero_si128(); #ifdef MOSIS_SIMD_INT64_NO int64 res64[2]; int64 sum64= 0; for(i= 0; i <= len- 16; i+= 16) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); mmB= _mm_load_si128((__m128i*) (vec2+ i)); // 1. »óÀ§ µ¥ÀÌÅÍ °è»ê. // 1.1 »óÀ§ 8°³ÀÇ 8bit µ¥ÀÌÅ͸¦ 8°³ÀÇ 16bit µ¥ÀÌÅÍ·Î º¯È¯ÈÄ.. °öÇÑ´Ù. mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)res64, mmResult); sum64+= res64[0]+ res64[1]; #endif UINT32 results[4]; for(i= 0; i <= len- 16; i+= 16) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); mmB= _mm_load_si128((__m128i*) (vec2+ i)); // 1. »óÀ§ µ¥ÀÌÅÍ °è»ê. // 1.1 »óÀ§ 8°³ÀÇ 8bit µ¥ÀÌÅ͸¦ 8°³ÀÇ 16bit µ¥ÀÌÅÍ·Î º¯È¯ÈÄ.. °öÇÑ´Ù. mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmResult= _mm_add_epi32(mmResult2, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum+= results[0]+ results[1]+ results[2]+ results[3]; #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec2[i]; int v = vec1[i + 1] * vec2[i + 1]; e += v; v = vec1[i + 2] * vec2[i + 2]; e += v; v = vec1[i + 3] * vec2[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec2[i]; } return sum + s; #endif }