#include "stdafx.h"
#include "MatchImpl.h"
// NOTE(review): the names of the angle-bracket includes that followed were lost
// from this listing (two active includes and one commented-out include).
// <emmintrin.h> is required by the SSE2 intrinsics used below; restore any other
// original include from version control if the build needs it.
#include <emmintrin.h>

#define MOSIS_SIMD

// ---------------------------------------------------------------------------
// Private helpers
// ---------------------------------------------------------------------------

// Returns the sum of the four signed 32-bit lanes of v, widened to 64 bits so
// the horizontal add itself cannot overflow.
static inline int64 SumLanes32(__m128i v)
{
	int lanes[4];
	_mm_storeu_si128((__m128i*)lanes, v);
	return (int64)lanes[0] + lanes[1] + lanes[2] + lanes[3];
}

// Returns the sum of the two 64-bit lanes of v.
static inline int64 SumLanes64(__m128i v)
{
	int64 lanes[2];
	_mm_storeu_si128((__m128i*)lanes, v);
	return lanes[0] + lanes[1];
}

// Multiplies 16 unsigned bytes of a with the matching bytes of b and returns
// the products as four 32-bit partial sums (each lane is at most 4*255*255).
static inline __m128i MulAdd16_8u(__m128i a, __m128i b)
{
	const __m128i zero = _mm_setzero_si128();

	// Upper 8 bytes: widen to 16 bit, multiply and add adjacent pairs.
	const __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(a, zero),
	                                  _mm_unpackhi_epi8(b, zero));
	// Lower 8 bytes: same treatment.
	const __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(a, zero),
	                                  _mm_unpacklo_epi8(b, zero));
	return _mm_add_epi32(hi, lo);
}

// ---------------------------------------------------------------------------
// Cross correlation
// ---------------------------------------------------------------------------

// Dot product of two unsigned 8-bit vectors of length len.
//
// vec1, vec2 : input vectors, at least len elements each (no alignment needed).
// len        : number of elements; values <= 0 yield 0.
// Returns the 64-bit sum of vec1[i]*vec2[i].
int64 CrossCorr_8u_C1(const uchar * vec1, const uchar * vec2, int len )
{
	int i = 0;
	int64 sum = 0;

#ifdef MOSIS_SIMD
	// Each block adds at most 260100 to a 32-bit lane, so 4096 blocks stay far
	// below INT_MAX. The lanes are flushed into the 64-bit sum at that interval
	// so that long, bright inputs no longer wrap around.
	const int kBlocksPerFlush = 4096;
	__m128i acc = _mm_setzero_si128();
	int pending = 0;

	for (i = 0; i <= len - 16; i += 16)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(vec1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(vec2 + i));
		acc = _mm_add_epi32(acc, MulAdd16_8u(a, b));

		if (++pending == kBlocksPerFlush)
		{
			sum += SumLanes32(acc);
			acc = _mm_setzero_si128();
			pending = 0;
		}
	}
	sum += SumLanes32(acc);
#else
	for (i = 0; i <= len - 4; i += 4)
	{
		const int e = vec1[i]     * vec2[i]
		            + vec1[i + 1] * vec2[i + 1]
		            + vec1[i + 2] * vec2[i + 2]
		            + vec1[i + 3] * vec2[i + 3];
		sum += e;
	}
#endif

	// Scalar tail for the elements that do not fill a whole block.
	for (; i < len; i++)
	{
		sum += vec1[i] * vec2[i];
	}

	return sum;
}

// Cross correlation of two width x height windows of 8-bit data.
//
// The window in buffer1 starts at (x1, y1), the one in buffer2 at (x2, y2).
// Only whole 16-pixel blocks of each row are processed; the remaining
// (width % 16) columns are ignored, matching GetAbsoluteDifference().
// Pixel addresses are obtained through CMosisBuffer::GetDataAddress(x, y).
int64 CrossCorr_8u_C1( CMosisBuffer buffer1, int x1, int y1, int width, int height, CMosisBuffer buffer2, int x2, int y2)
{
	int64 sum = 0;

	for (int h = 0; h < height; h++)
	{
		// One 32-bit accumulator per row, flushed to 64 bit after the row, so
		// large windows cannot overflow the lanes.
		__m128i rowAcc = _mm_setzero_si128();

		// "<=" so that the last full block is included when width is a
		// multiple of 16 (the previous "<" silently dropped it).
		for (int w = 0; w <= width - 16; w += 16)
		{
			const __m128i a = _mm_loadu_si128((__m128i*) buffer1.GetDataAddress(x1 + w, y1 + h));
			const __m128i b = _mm_loadu_si128((__m128i*) buffer2.GetDataAddress(x2 + w, y2 + h));
			rowAcc = _mm_add_epi32(rowAcc, MulAdd16_8u(a, b));
		}

		sum += SumLanes32(rowAcc);
	}

	return sum;
}

// Dot product of two unsigned 16-bit vectors of length len.
//
// The products are formed as full unsigned 32-bit values and accumulated in
// 64-bit lanes. (_mm_madd_epi16 is not usable here: it interprets the inputs
// as signed and its 32-bit pair sums overflow for large unsigned values.)
int64 CrossCorr_16u_C1( const ushort * vec1, const ushort * vec2, int len )
{
	int i = 0;
	int64 sum = 0;

#ifdef MOSIS_SIMD
	const __m128i zero = _mm_setzero_si128();
	__m128i acc = _mm_setzero_si128();

	for (i = 0; i <= len - 8; i += 8)
	{
		const __m128i a = _mm_loadu_si128((const __m128i*)(vec1 + i));
		const __m128i b = _mm_loadu_si128((const __m128i*)(vec2 + i));

		// Low and high 16 bits of each unsigned 16x16 product.
		const __m128i prodLo = _mm_mullo_epi16(a, b);
		const __m128i prodHi = _mm_mulhi_epu16(a, b);

		// Interleave to get eight unsigned 32-bit products.
		const __m128i prod0 = _mm_unpacklo_epi16(prodLo, prodHi);
		const __m128i prod1 = _mm_unpackhi_epi16(prodLo, prodHi);

		// Zero-extend to 64 bit and accumulate.
		acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(prod0, zero));
		acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(prod0, zero));
		acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(prod1, zero));
		acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(prod1, zero));
	}
	sum = SumLanes64(acc);
#else
	for (i = 0; i <= len - 4; i += 4)
	{
		sum += (int64)vec1[i]     * vec2[i]
		     + (int64)vec1[i + 1] * vec2[i + 1]
		     + (int64)vec1[i + 2] * vec2[i + 2]
		     + (int64)vec1[i + 3] * vec2[i + 3];
	}
#endif

	// Scalar tail; widen before multiplying to avoid int overflow.
	for (; i < len; i++)
	{
		sum += (int64)vec1[i] * vec2[i];
	}

	return sum;
}

// ---------------------------------------------------------------------------
// Sub-sample peak search
// ---------------------------------------------------------------------------

// Finds the largest positive sample in pData[0..nData-1] and refines its
// position to a fractional index using the two neighbouring samples.
//
// Returns the fractional index, or -1 when
//   - the input is empty or no sample is greater than 0,
//   - the peak lies on the first or last sample (no neighbour on one side),
//   - the refined position is more than one sample away from the peak.
float SearchMaximum(int64 *pData, int nData)
{
	if (!pData || nData <= 0)
		return -1;

	int64 best = 0;
	int iBest = -1;	// stays -1 when no sample exceeds 0

	for (int i = 0; i < nData; i++)
	{
		if (best < pData[i])
		{
			iBest = i;
			best = pData[i];
		}
	}

	if (iBest < 1 || iBest > nData - 2)
		return -1;

	const int64 bestBefore = pData[iBest - 1];
	const int64 bestAfter = pData[iBest + 1];
	float ret = 0;

	if (best == bestAfter)
	{
		// Plateau towards the right neighbour: peak sits halfway between.
		ret = iBest + 0.5;
	}
	else if (bestBefore < bestAfter)
	{
		ret = iBest + 0.5 - 0.5*(best - bestAfter)/(best - bestBefore);
	}
	else
	{
		ret = iBest - 0.5 + 0.5/(best - bestAfter)*(best - bestBefore);
	}

	if (ret > iBest + 1 || ret < iBest - 1)
		return -1;

	return ret;
}

// Finds the smallest sample in pData[0..nData-1] (the last one on ties) and
// refines its position to a fractional index using the two neighbours.
//
// Returns the fractional index, or -1 when the input is empty, the minimum
// lies on the first or last sample, or the refined position is more than one
// sample away from the minimum.
float SearchMinimum(int64 *pData, int nData)
{
	if (!pData || nData <= 0)
		return -1;

	int64 best = pData[0];
	int iBest = 0;

	for (int i = 0; i < nData; i++)
	{
		if (best >= pData[i])
		{
			iBest = i;
			best = pData[i];
		}
	}

	if (iBest < 1 || iBest > nData - 2)
		return -1;

	const int64 bestBefore = pData[iBest - 1];
	const int64 bestAfter = pData[iBest + 1];
	float ret = 0;

	if (best == bestAfter)
	{
		ret = iBest + 0.5;
	}
	else if (best == bestBefore)
	{
		ret = iBest - 0.5;
	}
	else if (bestBefore < bestAfter)
	{
		ret = iBest - 0.5 + 0.5/(best - bestAfter)*(best - bestBefore);
	}
	else	// bestBefore >= bestAfter
	{
		ret = iBest + 0.5 - 0.5*(best - bestAfter)/(best - bestBefore);
	}

	if (ret > iBest + 1 || ret < iBest - 1)
		return -1;

	return ret;
}

// ---------------------------------------------------------------------------
// Sum of absolute differences
// ---------------------------------------------------------------------------

// Shared implementation of the two GetAbsoluteDifference() overloads below.
// Sums |a - b| over whole 16-pixel blocks of a width x height window located
// at (xA, yA) in bufA and (xB, yB) in bufB. Columns beyond the last full
// block (width % 16) are not included.
static INT64 SumAbsDiffBlocks(CMosisBuffer &bufA, int xA, int yA, int width, int height, CMosisBuffer &bufB, int xB, int yB)
{
	// _mm_sad_epu8 yields two 64-bit lanes (one per 8 bytes), so the
	// accumulator is kept as two 64-bit lanes as well.
	__m128i acc = _mm_setzero_si128();

	for (int y = 0; y < height; y++)
	{
		for (int x = 0; x <= width - 16; x += 16)
		{
			const __m128i a = _mm_loadu_si128((__m128i*) bufA.GetDataAddress(xA + x, yA + y));
			const __m128i b = _mm_loadu_si128((__m128i*) bufB.GetDataAddress(xB + x, yB + y));
			acc = _mm_add_epi64(_mm_sad_epu8(a, b), acc);
		}
	}

	return SumLanes64(acc);
}

// Sum of absolute differences between two windows of the same buffer,
// one at (x1, y1) and one at (x2, y2), each width x height pixels.
// Only whole 16-pixel blocks per row are evaluated.
AFX_EXT_API INT64 GetAbsoluteDifference(CMosisBuffer buffer, int x1, int y1, int width, int height, int x2, int y2)
{
	return SumAbsDiffBlocks(buffer, x1, y1, width, height, buffer, x2, y2);
}

// Sum of absolute differences between a window at (x1, y1) in buffer and a
// window at (x2, y2) in buffer2, each width x height pixels.
// Only whole 16-pixel blocks per row are evaluated.
AFX_EXT_API INT64 GetAbsoluteDifference(CMosisBuffer buffer, int x1, int y1, int width, int height, CMosisBuffer buffer2, int x2, int y2)
{
	return SumAbsDiffBlocks(buffer, x1, y1, width, height, buffer2, x2, y2);
}

// Sub-pixel variant of GetAbsoluteDifference().
//
// NOTE(review): this overload is unfinished. The per-block differences are
// computed and then discarded, `subpixel` is never read, and the function
// always returns 0. The behaviour is kept as it was; callers must not rely on
// the result until the accumulation is implemented.
// NOTE(review): width is divided by 16 and the loop then also steps by 16,
// which looks inconsistent with the other overloads - confirm the intent.
AFX_EXT_API INT64 GetAbsoluteDifference(CMosisBuffer buffer, int x1, int y1, int width, int height, int x2, int y2, float subpixel)
{
	width = width >> 4;	// divide by 16

	__m128i mmA, mmB, mmBB;
	int64 sum = 0;

	for (int y = y1; y < y1 + height; y++)
	{
		for (int x = 0; x <= width - 16; x += 16)
		{
			mmA  = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x1 + x, y));
			mmB  = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x2 + x, y));
			mmBB = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x2 + x + 1, y));

			mmB  = _mm_sub_epi8(mmA, mmB);
			mmBB = _mm_sub_epi8(mmA, mmBB);
		}
	}

	return sum;
}

// ---------------------------------------------------------------------------
// Vertical pitch comparison
// ---------------------------------------------------------------------------

// Loads 16 bytes from p and zero-extends them into two vectors of eight
// 16-bit lanes (lo = bytes 0..7, hi = bytes 8..15).
static inline void LoadWiden16_8u(const unsigned char *p, __m128i *lo, __m128i *hi)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i raw = _mm_loadu_si128((const __m128i*)p);
	*hi = _mm_unpackhi_epi8(raw, zero);
	*lo = _mm_unpacklo_epi8(raw, zero);
}

// Computes, per 16-bit lane, ((rowA*weightA + rowB*weightB) >> 4) - src for a
// block of 16 pixels. The weights are expected to add up to 16.
static inline void BlendedRowDiff(const unsigned char *rowA, const unsigned char *rowB,
                                  __m128i weightA, __m128i weightB,
                                  __m128i srcLo, __m128i srcHi,
                                  __m128i *diffLo, __m128i *diffHi)
{
	__m128i aLo, aHi, bLo, bHi;
	LoadWiden16_8u(rowA, &aLo, &aHi);
	LoadWiden16_8u(rowB, &bLo, &bHi);

	aHi = _mm_mullo_epi16(aHi, weightA);
	aLo = _mm_mullo_epi16(aLo, weightA);
	bHi = _mm_mullo_epi16(bHi, weightB);
	bLo = _mm_mullo_epi16(bLo, weightB);

	const __m128i blendHi = _mm_srli_epi16(_mm_adds_epi16(aHi, bHi), 4);
	const __m128i blendLo = _mm_srli_epi16(_mm_adds_epi16(aLo, bLo), 4);

	*diffHi = _mm_subs_epi16(blendHi, srcHi);
	*diffLo = _mm_subs_epi16(blendLo, srcLo);
}

// True when at least one 16-bit lane of lo/hi is greater than limit.
static inline bool AnyLaneGreater(__m128i lo, __m128i hi, __m128i limit)
{
	return _mm_movemask_epi8(_mm_cmpgt_epi16(hi, limit)) != 0
	    || _mm_movemask_epi8(_mm_cmpgt_epi16(lo, limit)) != 0;
}

// True when at least one 16-bit lane of lo/hi is less than limit.
static inline bool AnyLaneLess(__m128i lo, __m128i hi, __m128i limit)
{
	return _mm_movemask_epi8(_mm_cmplt_epi16(hi, limit)) != 0
	    || _mm_movemask_epi8(_mm_cmplt_epi16(lo, limit)) != 0;
}

// Compares every pixel with the pixels one (fractional) pitch below and above
// it and writes the difference where both comparisons exceed the threshold.
//
// pSrc, pDest  : 8-bit images of nImageWidth x nImageHeight pixels, rows
//                stored back to back (row stride == nImageWidth).
// pitch        : vertical distance in rows; the fractional part is resolved
//                in 1/16 steps by blending two adjacent rows.
// Threshold    : minimum magnitude of the difference.
//
// For each block of 16 horizontally adjacent pixels in row j:
//   fwd = blend(row j+pitch) - src,  bwd = blend(row j-pitch) - src
//   - if some lane of fwd is >  Threshold and some lane of bwd is >  Threshold,
//     or some lane of fwd is < -Threshold and some lane of bwd is < -Threshold,
//     the whole block of fwd is stored, saturated to 0..255;
//   - otherwise the block is set to 0.
// Rows closer than pitch+1 to the top or bottom edge are not written.
// Only whole 16-pixel blocks are processed; a remainder of nImageWidth % 16
// columns is left untouched (processing it used to run over the row end and
// overwrite the start of the next row).
void VertConvSIMD(unsigned char *pSrc, unsigned char *pDest, unsigned int nImageWidth, unsigned int nImageHeight, float pitch, int Threshold)
{
	const int nPitch = (int)pitch;
	const int SubPixel = (int)((pitch - nPitch) * 16);

	// A negative pitch would address rows outside the image.
	if (!pSrc || !pDest || nPitch < 0)
		return;

	// Signed bounds: the former unsigned expression nImageHeight-nPitch-1
	// wrapped around for images shorter than the pitch.
	const int width = (int)nImageWidth;
	const int rowEnd = (int)nImageHeight - nPitch - 1;
	const size_t stride = nImageWidth;

	const __m128i zero = _mm_setzero_si128();
	const __m128i weightNear = _mm_set1_epi16(16 - SubPixel);	// row at +/- nPitch
	const __m128i weightFar = _mm_set1_epi16(SubPixel);		// row at +/- (nPitch+1)
	const __m128i posLimit = _mm_set1_epi16(Threshold);
	const __m128i negLimit = _mm_set1_epi16(Threshold * -1);

	for (int i = 0; i + 16 <= width; i += 16)
	{
		for (int j = nPitch + 1; j < rowEnd; j++)
		{
			const unsigned char *srcCol = pSrc + i;
			unsigned char *out = pDest + (size_t)j * stride + i;

			__m128i srcLo, srcHi;
			LoadWiden16_8u(srcCol + (size_t)j * stride, &srcLo, &srcHi);

			// Difference towards the rows one pitch below.
			__m128i fwdLo, fwdHi;
			BlendedRowDiff(srcCol + (size_t)(j + nPitch) * stride,
			               srcCol + (size_t)(j + nPitch + 1) * stride,
			               weightNear, weightFar, srcLo, srcHi, &fwdLo, &fwdHi);

			const bool fwdAbove = AnyLaneGreater(fwdLo, fwdHi, posLimit);
			const bool fwdBelow = AnyLaneLess(fwdLo, fwdHi, negLimit);

			bool keep = false;
			if (fwdAbove || fwdBelow)
			{
				// Only now is the opposite direction worth computing.
				__m128i bwdLo, bwdHi;
				BlendedRowDiff(srcCol + (size_t)(j - nPitch) * stride,
				               srcCol + (size_t)(j - nPitch - 1) * stride,
				               weightNear, weightFar, srcLo, srcHi, &bwdLo, &bwdHi);

				if (fwdAbove && AnyLaneGreater(bwdLo, bwdHi, posLimit))
					keep = true;
				else if (fwdBelow && AnyLaneLess(bwdLo, bwdHi, negLimit))
					keep = true;
			}

			if (keep)
				_mm_storeu_si128((__m128i*)out, _mm_packus_epi16(fwdLo, fwdHi));
			else
				_mm_storeu_si128((__m128i*)out, zero);
		}
	}
}