// SISMath.cpp: implementation of the CSISMath class. // ////////////////////////////////////////////////////////////////////// #include "stdafx.h" #include "SISMath.h" #include #include #include #ifdef _DEBUG #define new DEBUG_NEW #undef THIS_FILE static char THIS_FILE[] = __FILE__; #endif ////////////////////////////////////////////////////////////////////// // Construction/Destruction ////////////////////////////////////////////////////////////////////// #define SIS_SIMD // C++ ¹öÀü¿¡¼­ Intrinsic ¹öÀüÀ¸·Î Àû¿ë ÇÔ.. C++ ¹öÀü »èÁ¦ °¡´É ÇϰÚÀ½.. //#define SIS_ASM // Intrinsic ¹öÀü¿¡¼­ Assembly ¹öÀüÀ¸·Î Àû¿ëÇÔ.. ±¸Çö Å×½ºÆ® ÈÄ ½Ã°£Â÷ ¾ø¾î¼­ IntrinsicÀ¸·Î Àû¿ëÇÔ. //#define SIS_DEBUG_OMP // Intrinsic ¹öÀü¿¡¼­ Open MP Àû¿ë // OpenMP Àû¿ë ºÒ°¡.. ±¸ÇöÈÄ Å×½ºÆ®¿¡¼­ ¼Óµµ ´À·ÁÁü. // SIMD ´Ü¿¡ Àû¿ë ÇßÀ¸³ª ´ëºÎºÐÀÇ ÄÉÀ̽º¿¡¼­ ½Ã°£ÀÌ 2¹è ¼Ò¿ä.. // Thread OverHead , Context switching¿øÀÎÀ¸·Î ÆÇ´ÜµÊ. // Debug ¿¡¼­ µ¿ÀÛÀº Çϳª Release¿¡¼­ Á¤»óµ¿ÀÛ ¾ÈÇÔ. #if defined(SIS_ASM) extern "C"{ int64 _asm_Get_CCU8(const uchar * vec1, const uchar * vec2, int len, int flag);// 0= aa, 1=au, 2=uu }; #endif //#define SIS_DEBUG_ALIGNED_UNALIGNED INT64 CSISMath::GetSum(CSISBuffer buffer) { __m128i mmA, mmB= _mm_setzero_si128(); __m128i mmRes= _mm_setzero_si128(); int64 sum= 0; int64 results[2]; int x, y, width= buffer.GetWidth(), height= buffer.GetHeight(); for(y= 0; y< height; y++) { for(x= 0; x< width; x+= 16) { mmA= _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x, y)); mmA= _mm_sad_epu8(mmA, mmB); mmRes= _mm_add_epi64(mmA, mmRes); } //16À¸·Î ³ª´« ³ª¸ÓÁö ¿µ¿ª ó¸®. for( ; x < width; x++ ) { sum += buffer.GetPixel(x, y); } } _mm_storeu_si128((__m128i*)results, mmRes); sum= results[0]+ results[1]+ sum; return sum; } double CSISMath::GetAverage(CSISBuffer buffer) { return (double)(GetSum(buffer))/(double)(buffer.GetWidth()*buffer.GetHeight()); } INT64 CSISMath::GetAbsDiff(BYTE* pData, BYTE* pData2, int len) { __m128i mmA, mmB; __m128i mmRes= _mm_setzero_si128(); int64 sum= 0; int64 results[2]; int x= 0; for( x= 0; x<= len- 16; x+= 16) { mmA= _mm_loadu_si128((__m128i*) pData+ x); mmB= _mm_loadu_si128((__m128i*) pData2+ x); mmA= _mm_sad_epu8(mmA, mmB); mmRes= _mm_add_epi64(mmA, mmRes); } _mm_storeu_si128((__m128i*)results, mmRes); for( x= 0; x< len; x++) sum+= *(pData+len); sum= results[0]+ results[1]+ sum; return sum; } INT64 CSISMath::GetAbsDiff(CSISBuffer buffer, int x1, int y1, int width, int height, CSISBuffer buffer2, int x2, int y2) { // width= width>>4<<4;// 16À¸·Î ³ª´«´Ù. __m128i mmA, mmB; __m128i mmRes= _mm_setzero_si128(); int64 sum= 0; int64 results[2]; // mmA= _mm_abs_epi16(mmA); // pabsw xmm0,xmm0 // mmA= _mm_hadd_epi16(mmA, mmB); // phaddw xmm0,xmmword ptr [rsp+30h] // mmA= _mm_madd_epi16(mmA, mmB); // pmaddwd xmm0,xmmword ptr [rsp+30h] int x; for(int y= 0; y< height; y++) { for( x= 0; x<= width- 16; x+= 16) { mmA= _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x1+ x, y1+ y)); mmB= _mm_loadu_si128((__m128i*) buffer2.GetDataAddress(x2+ x, y2+ y)); mmA= _mm_sad_epu8(mmA, mmB); mmRes= _mm_add_epi64(mmA, mmRes); } // 16À¸·Î ³ª´« ³ª¸ÓÁö ¿µ¿ª ó¸®. // for( ; x < width; x++ ) // { // sum += abs(buffer.GetPixel(x1+ x, y1+ y) - buffer2.GetPixel(x2+ x, y2+ y)); // } } _mm_storeu_si128((__m128i*)results, mmRes); sum= results[0]+ results[1]+ sum; return sum; } // ¾çÀÇ °î¼±. ÃÖ´ë°ªÀÌ Á¸ÀçÇÑ´Ù. double CSISMath::SearchMaximum(int64 *pData, int nData) { if(pData == NULL) return -1; if(nData < 2) return -1; int64 best= *pData; int64 bestBefore, bestAfter; int iBest= 0; double ret= 0; for(int i= 1; i< nData; i++) { if(best < *(pData+ i)) { iBest= i; best= *(pData+ i); } } if(iBest < 1 || iBest > nData- 2) return -1; bestBefore= *(pData+ iBest- 1); bestAfter= *(pData+ iBest+ 1); if((best- bestAfter) == 0) { ret= iBest+ 0.5; } else if((best- bestBefore) == 0) { ret= iBest- 0.5; } else if(bestBefore < bestAfter) { ret= iBest+ 0.5- 0.5*(best- bestAfter)/(best- bestBefore); } else { ret= iBest- 0.5+ 0.5/(best- bestAfter)*(best- bestBefore); } if(ret > iBest+ 1 || ret < iBest- 1) return -1; return ret; } // À½ÀÇ °î¼±. ÃÖ¼Ò°ªÀÌ Á¸ÀçÇÑ´Ù. double CSISMath::SearchMinimum(int64 *pData, int nData) { if(pData == NULL) return -1; if(nData < 2) return -1; int64 best= *pData; int64 bestBefore, bestAfter; int iBest= 0; double ret= 0; for(int i= 1; i< nData; i++) { if(best > *(pData+ i)) { iBest= i; best= *(pData+ i); } } if(iBest < 1 || iBest > nData- 2) return -1; bestBefore= *(pData+ iBest- 1); bestAfter= *(pData+ iBest+ 1); if((best- bestAfter) == 0) { ret= iBest+ 0.5; } else if((best- bestBefore) == 0) { ret= iBest- 0.5; } else if(bestBefore < bestAfter) { ret= iBest- 0.5+ 0.5/(best- bestAfter)*(best- bestBefore); } else// if(*(pBest-1) > bestAfter) { ret= iBest+ 0.5- 0.5*(best- bestAfter)/(best- bestBefore); } if(ret > iBest+ 1 || ret < iBest- 1) return -1; return ret; } #define SIS_SIMD_INT64 int64 CSISMath::SumPixels_8u( const uchar * vec, int len ) { int i= 0, s = 0; int64 sum = 0; #if defined(SIS_SIMD) int results[4]; __m128i mmResult; __m128i mmA, mmB; __m128i mmZeroData= _mm_setzero_si128(); mmResult= _mm_setzero_si128(); __m128i mmOne= _mm_set1_epi16(1); for(i= 0; i<= len- 16; i+= 16) { mmA= _mm_loadu_si128((__m128i*) (vec+ i)); mmB= _mm_unpackhi_epi8(mmA, mmZeroData); mmA= _mm_unpacklo_epi8(mmA, mmZeroData); mmA= _mm_add_epi16(mmA, mmB); mmA= _mm_madd_epi16(mmA, mmOne); mmResult= _mm_add_epi32(mmA, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum= (int64)results[0]+ results[1]+ results[2]+ results[3]; #else for( i = 0; i <= len - 4; i += 4 ) { sum += vec[i] + vec[i + 1] + vec[i + 2] + vec[i + 3]; } #endif for( ; i < len; i++ ) { s += vec[i]; } return sum + s; } int64 CSISMath::SumPixels_16u( const ushort * vec, int len ) { int64 sum = 0; int i, s = 0; int64 sum2= 0; #if defined(SIS_SIMD) UINT32 results[4]; __m128i mmResult; __m128i mmA; __m128i mmZeroData= _mm_setzero_si128(); mmResult= _mm_setzero_si128(); __m128i mmOne= _mm_set1_epi16(1); for(i= 0; i<= len- 8; i+= 8) { mmA= _mm_loadu_si128((__m128i*) (vec+ i)); // mmA= _mm_add_epi16(mmA, mmB); mmA= _mm_madd_epi16(mmA, mmOne); mmResult= _mm_add_epi32(mmA, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum= (int64)results[0]+ results[1]+ results[2]+ results[3]; #else for( i = 0; i <= len - 8; i += 8 ) { sum += vec[i] + vec[i + 1] + vec[i + 2] + vec[i + 3] + vec[i+ 4] + vec[i + 5] + vec[i + 6] + vec[i + 7]; } #endif for( ; i < len; i++ ) { s += vec[i]; } return sum + s; } int64 CSISMath::Get_CCU8_A(const uchar * vec1, int len ) { #if defined(SIS_DEBUG_OMP) int64 rslt= OMP_CCU8_A(vec1, len); return rslt; #endif #if defined(SIS_ASM) return _asm_Get_CCU8(vec1, NULL, len, 0); #else #if defined(SIS_DEBUG_ALIGNED_UNALIGNED) return Get_CCU8_UA(vec1, vec1, len); #endif int i, s = 0; int64 sum = 0; #ifdef SIS_SIMD __m128i mmResult, mmResult2; __m128i mmA; __m128i mmAA, mmBB; __m128i mmZeroData= _mm_setzero_si128(); #ifdef SIS_SIMD_INT64 int64 res64[2]; mmResult= _mm_setzero_si128(); for(i= 0; i <= len- 16; i+= 16) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); } _mm_storeu_si128((__m128i*)res64, mmResult); sum= res64[0]+ res64[1]; #else UINT32 results[4]; mmResult= _mm_setzero_si128(); for(i= 0; i <= len- 16; i+= 16) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); // mmB= _mm_load_si128((__m128i*) (vec2+ i)); // 1. »óÀ§ µ¥ÀÌÅÍ °è»ê. // 1.1 »óÀ§ 8°³ÀÇ 8bit µ¥ÀÌÅ͸¦ 8°³ÀÇ 16bit µ¥ÀÌÅÍ·Î º¯È¯ÈÄ.. °öÇÑ´Ù. mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); // mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmResult= _mm_add_epi32(mmResult2, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); // mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum= results[0]+ results[1]+ results[2]+ results[3]; #endif #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec1[i]; int v = vec1[i + 1] * vec1[i + 1]; e += v; v = vec1[i + 2] * vec1[i + 2]; e += v; v = vec1[i + 3] * vec1[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec1[i]; } return sum + s; #endif } int64 CSISMath::Get_CCU8_AUA(const uchar * vec1, const uchar * vec2, int len ) { #if defined(SIS_DEBUG_OMP) int64 rslt= OMP_CCU8_AUA(vec1, vec2, len); return rslt; #endif #if defined(SIS_ASM) return _asm_Get_CCU8(vec1, vec2, len, 1); #else #if defined(SIS_DEBUG_ALIGNED_UNALIGNED) return Get_CCU8_UA(vec1, vec2, len); #endif int i, s = 0; int64 sum = 0; #ifdef SIS_SIMD __m128i mmResult, mmResult2; __m128i mmA, mmB; __m128i mmAA, mmBB; __m128i mmZeroData= _mm_setzero_si128(); mmResult= _mm_setzero_si128(); #ifdef SIS_SIMD_INT64 int64 res64[2]; for(i= 0; i <= len- 16; i+= 16) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); mmB= _mm_loadu_si128((__m128i*) (vec2+ i)); mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); } _mm_storeu_si128((__m128i*)res64, mmResult); sum= res64[0]+ res64[1]; #else UINT32 results[4]; for(i= 0; i <= len- 16; i+= 16) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); mmB= _mm_loadu_si128((__m128i*) (vec2+ i)); // 1. »óÀ§ µ¥ÀÌÅÍ °è»ê. // 1.1 »óÀ§ 8°³ÀÇ 8bit µ¥ÀÌÅ͸¦ 8°³ÀÇ 16bit µ¥ÀÌÅÍ·Î º¯È¯ÈÄ.. °öÇÑ´Ù. mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmResult= _mm_add_epi32(mmResult2, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum+= results[0]+ results[1]+ results[2]+ results[3]; #endif #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec2[i]; int v = vec1[i + 1] * vec2[i + 1]; e += v; v = vec1[i + 2] * vec2[i + 2]; e += v; v = vec1[i + 3] * vec2[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec2[i]; } return sum + s; #endif } int64 CSISMath::Get_CCU8_UA(const uchar * vec1, const uchar * vec2, int len ) { #if defined(SIS_ASM) return _asm_Get_CCU8(vec1, vec2, len, 2); #else int i, s = 0; int64 sum = 0; #ifdef SIS_SIMD __m128i mmResult, mmResult2; __m128i mmA, mmB; __m128i mmAA, mmBB; __m128i mmZeroData= _mm_setzero_si128(); mmResult= _mm_setzero_si128(); #ifdef SIS_SIMD_INT64 #endif UINT32 results[4]; for(i= 0; i <= len- 16; i+= 16) { mmA= _mm_loadu_si128((__m128i*) (vec1+ i)); mmB= _mm_loadu_si128((__m128i*) (vec2+ i)); // 1. »óÀ§ µ¥ÀÌÅÍ °è»ê. // 1.1 »óÀ§ 8°³ÀÇ 8bit µ¥ÀÌÅ͸¦ 8°³ÀÇ 16bit µ¥ÀÌÅÍ·Î º¯È¯ÈÄ.. °öÇÑ´Ù. mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmResult= _mm_add_epi32(mmResult2, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmResult= _mm_add_epi32(mmResult2, mmResult); /* // 1. »óÀ§ µ¥ÀÌÅÍ °è»ê. // 1.1 »óÀ§ 8°³ÀÇ 8bit µ¥ÀÌÅ͸¦ 8°³ÀÇ 16bit µ¥ÀÌÅÍ·Î º¯È¯ÈÄ.. °öÇÑ´Ù. mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_mullo_epi16(mmAA, mmBB); // 1.2 8°³ÀÇ 16bit µ¥ÀÌÅ͸¦ 4°³ÀÇ 32ºñÆ® µ¥ÀÌÅÍÇü µÎ°³·Î º¯È¯ÈÄ.. ´õÇÑ´Ù. mmAA= _mm_unpackhi_epi16(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi16(mmResult2, mmZeroData); mmResult2= _mm_add_epi32(mmAA, mmBB); // 1.3 ... ÃÑÇÕ¿¡ ´õÇÑ´Ù. mmResult= _mm_add_epi32(mmResult, mmResult2); // 2. ÇÏÀ§ µ¥ÀÌÅÍ °è»ê. // 2.1 ÇÏÀ§ 8°³ÀÇ 8bit µ¥ÀÌÅ͸¦ 8°³ÀÇ 16bit µ¥ÀÌÅÍ·Î º¯È¯ÈÄ.. °öÇÑ´Ù. mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_mullo_epi16(mmAA, mmBB); // 2.2 8°³ÀÇ 16bit µ¥ÀÌÅ͸¦ 4°³ÀÇ 32bit µ¥ÀÌÅÍÇü µÎ°³·Î º¯È¯ÈÄ.. ´õÇÑ´Ù. mmAA= _mm_unpackhi_epi16(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi16(mmResult2, mmZeroData); mmResult2= _mm_add_epi32(mmAA, mmBB); // 2.3 ... ÃÑÇÕ¿¡ ´õÇÑ´Ù. mmResult= _mm_add_epi32(mmResult, mmResult2); */ } _mm_storeu_si128((__m128i*)results, mmResult); sum+= (int64)results[0]+ results[1]+ results[2]+ results[3]; #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec2[i]; int v = vec1[i + 1] * vec2[i + 1]; e += v; v = vec1[i + 2] * vec2[i + 2]; e += v; v = vec1[i + 3] * vec2[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec2[i]; } return sum + s; #endif } int64 CSISMath::Get_CCU8_UA(const uchar * vec1, int len ) { #if defined(SIS_DEBUG_OMP) int64 rslt= OMP_CCU8_UA(vec1, len); return rslt; #endif #if defined(SIS_ASM) return _asm_Get_CCU8(vec1, NULL, len, 2); #else int i, s = 0; int64 sum = 0; #ifdef SIS_SIMD __m128i mmResult, mmResult2; __m128i mmA; __m128i mmAA, mmBB; __m128i mmZeroData= _mm_setzero_si128(); #ifdef SIS_SIMD_INT64 int64 res64[2]; mmResult= _mm_setzero_si128(); for(i= 0; i <= len- 16; i+= 16) { mmA= _mm_loadu_si128((__m128i*) (vec1+ i)); mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); } _mm_storeu_si128((__m128i*)res64, mmResult); sum= res64[0]+ res64[1]; #else UINT32 results[4]; mmResult= _mm_setzero_si128(); for(i= 0; i <= len- 16; i+= 16) { mmA= _mm_loadu_si128((__m128i*) (vec1+ i)); // mmB= _mm_loadu_si128((__m128i*) (vec2+ i)); // 1. »óÀ§ µ¥ÀÌÅÍ °è»ê. // 1.1 »óÀ§ 8°³ÀÇ 8bit µ¥ÀÌÅ͸¦ 8°³ÀÇ 16bit µ¥ÀÌÅÍ·Î º¯È¯ÈÄ.. °öÇÑ´Ù. mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); // mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmResult= _mm_add_epi32(mmResult2, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); // mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum+= results[0]+ results[1]+ results[2]+ results[3]; #endif #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec1[i]; int v = vec1[i + 1] * vec1[i + 1]; e += v; v = vec1[i + 2] * vec1[i + 2]; e += v; v = vec1[i + 3] * vec1[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec1[i]; } return sum + s; #endif } int64 CSISMath::Get_CCU8( CSISBuffer buffer1, int x1, int y1, int width, int height, CSISBuffer buffer2, int x2, int y2) { int64 sum = 0; int64 sum2= 0; int h, w; __m128i mmA, mmB; __m128i mmAA, mmBB; __m128i mmResult, mmResult2; __m128i mmZeroData= _mm_setzero_si128(); mmResult= _mm_setzero_si128(); for(h= 0; h< height; h++) { for(w= 0; w < width- 16; w+= 16) { mmA= _mm_loadu_si128((__m128i*) buffer1.GetDataAddress(x1+ w, y1+ h)); mmB= _mm_loadu_si128((__m128i*) buffer2.GetDataAddress(x2+ w, y2+ h)); mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmResult= _mm_add_epi32(mmResult2, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmResult= _mm_add_epi32(mmResult2, mmResult); } // for(; w< width; w++) // { // sum2+= buffer1.GetPixel(x1+ w, y1+ h)*buffer2.GetPixel(x2+ w, y2+ h); // } } UINT32 results[4]; _mm_storeu_si128((__m128i*)results, mmResult); sum= (int64)results[0]+ results[1]+ results[2]+ results[3]+ sum2; return sum; } int64 CSISMath::Get_CCU16_A( const ushort * vec1, const ushort * vec2, int len ) { #if defined(SIS_DEBUG_ALIGNED_UNALIGNED) return Get_CCU16_UA(vec1, vec2, len); #endif int i, s = 0; int64 sum = 0; #ifdef SIS_SIMD __m128i mmResult, mmResult2; __m128i mmA, mmB; mmResult= _mm_setzero_si128(); UINT32 results[4]; for(i= 0; i <= len- 8; i+= 8) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); mmB= _mm_load_si128((__m128i*) (vec2+ i)); mmResult2= _mm_madd_epi16(mmA, mmB); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum= (int64)results[0]+ results[1]+ results[2]+ results[3]; #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec2[i]; int v = vec1[i + 1] * vec2[i + 1]; e += v; v = vec1[i + 2] * vec2[i + 2]; e += v; v = vec1[i + 3] * vec2[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec2[i]; } return sum + s; } int64 CSISMath::Get_CCU16_A( const ushort * vec1, int len ) { #if defined(SIS_DEBUG_ALIGNED_UNALIGNED) return Get_CCU16_UA(vec1, vec1, len); #endif int i, s = 0; int64 sum = 0; #ifdef SIS_SIMD __m128i mmResult, mmResult2; __m128i mmA; mmResult= _mm_setzero_si128(); UINT32 results[4]; for(i= 0; i <= len- 8; i+= 8) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); // mmB= _mm_load_si128((__m128i*) (vec2+ i)); mmResult2= _mm_madd_epi16(mmA, mmA); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum= (int64)results[0]+ results[1]+ results[2]+ results[3]; #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec1[i]; int v = vec1[i + 1] * vec1[i + 1]; e += v; v = vec1[i + 2] * vec1[i + 2]; e += v; v = vec1[i + 3] * vec1[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec1[i]; } return sum + s; } int64 CSISMath::Get_CCU16_UA( const ushort * vec1, const ushort * vec2, int len ) { int i, s = 0; int64 sum = 0; #ifdef SIS_SIMD __m128i mmResult, mmResult2; __m128i mmA, mmB; mmResult= _mm_setzero_si128(); UINT32 results[4]; for(i= 0; i <= len- 8; i+= 8) { mmA= _mm_loadu_si128((__m128i*) (vec1+ i)); mmB= _mm_loadu_si128((__m128i*) (vec2+ i)); mmResult2= _mm_madd_epi16(mmA, mmB); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum= (int64)results[0]+ results[1]+ results[2]+ results[3]; #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec2[i]; int v = vec1[i + 1] * vec2[i + 1]; e += v; v = vec1[i + 2] * vec2[i + 2]; e += v; v = vec1[i + 3] * vec2[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec2[i]; } return sum + s; } int64 CSISMath::Get_CCU16_UA( const ushort * vec1, int len ) { int i, s = 0; int64 sum = 0; #ifdef SIS_SIMD __m128i mmResult, mmResult2; __m128i mmA; mmResult= _mm_setzero_si128(); UINT32 results[4]; for(i= 0; i <= len- 8; i+= 8) { mmA= _mm_loadu_si128((__m128i*) (vec1+ i)); mmResult2= _mm_madd_epi16(mmA, mmA); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum= (int64)results[0]+ results[1]+ results[2]+ results[3]; #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec1[i]; int v = vec1[i + 1] * vec1[i + 1]; e += v; v = vec1[i + 2] * vec1[i + 2]; e += v; v = vec1[i + 3] * vec1[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec1[i]; } return sum + s; } int64 CSISMath::Get_CCU16_AUA( const ushort * vec1, const ushort * vec2, int len ) { #if defined(SIS_DEBUG_ALIGNED_UNALIGNED) return Get_CCU16_UA(vec1, vec2, len); #endif int i, s = 0; int64 sum = 0; #ifdef SIS_SIMD __m128i mmResult, mmResult2; __m128i mmA, mmB; mmResult= _mm_setzero_si128(); UINT32 results[4]; for(i= 0; i <= len- 8; i+= 8) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); mmB= _mm_loadu_si128((__m128i*) (vec2+ i)); mmResult2= _mm_madd_epi16(mmA, mmB); mmResult= _mm_add_epi32(mmResult2, mmResult); } _mm_storeu_si128((__m128i*)results, mmResult); sum= (int64)results[0]+ results[1]+ results[2]+ results[3]; #else for( i = 0; i <= len - 4; i += 4 ) { int e = vec1[i] * vec2[i]; int v = vec1[i + 1] * vec2[i + 1]; e += v; v = vec1[i + 2] * vec2[i + 2]; e += v; v = vec1[i + 3] * vec2[i + 3]; e += v; sum += e; } #endif for( ; i < len; i++ ) { s += vec1[i] * vec2[i]; } return sum + s; }