|
#include "stdafx.h"
|
|
#include "MatchImpl.h"
|
|
#include <emmintrin.h>
|
//#include <tmmintrin.h>
|
#include <ia32intrin.h>
|
|
|
#define MOSIS_SIMD
|
|
/**
 * Dot product (un-normalised cross-correlation) of two 8-bit vectors.
 *
 * @param vec1 first vector, at least len readable elements
 * @param vec2 second vector, at least len readable elements
 * @param len  number of elements to process; a value <= 0 yields 0
 * @return sum of vec1[i] * vec2[i] for i in [0, len)
 */
int64 CrossCorr_8u_C1(const uchar * vec1, const uchar * vec2, int len )
{
    int64 sum = 0;
    int i = 0;

#ifdef MOSIS_SIMD
    // One 16-byte block adds at most 4 * 255 * 255 = 260100 to each 32-bit
    // lane, so the lanes are drained into the 64-bit total every
    // kBlocksPerFlush blocks (4096 * 260100 < 2^31) before they can wrap.
    const int kBlocksPerFlush = 4096;
    const __m128i zero = _mm_setzero_si128();

    while (i <= len - 16)
    {
        __m128i acc = _mm_setzero_si128();

        for (int block = 0; block < kBlocksPerFlush && i <= len - 16; ++block, i += 16)
        {
            const __m128i a = _mm_loadu_si128((const __m128i*) (vec1 + i));
            const __m128i b = _mm_loadu_si128((const __m128i*) (vec2 + i));

            // Upper 8 bytes: widen to 16 bit, multiply and add adjacent pairs.
            acc = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi8(a, zero),
                                               _mm_unpackhi_epi8(b, zero)), acc);

            // Lower 8 bytes: same treatment.
            acc = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(a, zero),
                                               _mm_unpacklo_epi8(b, zero)), acc);
        }

        int lanes[4];
        _mm_storeu_si128((__m128i*) lanes, acc);
        // Widen before adding so the horizontal sum cannot overflow int.
        sum += (int64) lanes[0] + lanes[1] + lanes[2] + lanes[3];
    }
#endif

    // Scalar tail (or the whole vector when MOSIS_SIMD is not defined).
    for ( ; i < len; i++)
    {
        sum += vec1[i] * vec2[i];
    }

    return sum;
}
|
|
/**
 * Un-normalised cross-correlation of two 8-bit image windows.
 *
 * The window in buffer1 starts at (x1, y1), the one in buffer2 at (x2, y2);
 * both are width x height pixels. Each row is processed in blocks of 16
 * pixels; the trailing width % 16 columns of every row are NOT included.
 *
 * @return sum of the products of corresponding pixels over the processed
 *         blocks
 *
 * NOTE(review): assumes GetDataAddress(x, y) returns the address of 8-bit
 * pixel (x, y) with at least 16 readable bytes behind it - confirm against
 * CMosisBuffer.
 */
int64 CrossCorr_8u_C1( CMosisBuffer buffer1, int x1, int y1, int width, int height, CMosisBuffer buffer2, int x2, int y2)
{
    const __m128i zero = _mm_setzero_si128();
    int64 sum = 0;

    for (int h = 0; h < height; h++)
    {
        // The 32-bit lanes are drained once per row, so they only need to
        // hold one row (at most 260100 per 16-pixel block, i.e. safe for
        // rows up to roughly 132000 pixels).
        __m128i rowAcc = _mm_setzero_si128();

        // "<=" so that the last complete 16-pixel block is included
        // (e.g. width == 16 processes exactly one block).
        for (int w = 0; w <= width - 16; w += 16)
        {
            const __m128i a = _mm_loadu_si128((__m128i*) buffer1.GetDataAddress(x1 + w, y1 + h));
            const __m128i b = _mm_loadu_si128((__m128i*) buffer2.GetDataAddress(x2 + w, y2 + h));

            // Upper 8 bytes: widen to 16 bit, multiply and add adjacent pairs.
            rowAcc = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi8(a, zero),
                                                  _mm_unpackhi_epi8(b, zero)), rowAcc);

            // Lower 8 bytes: same treatment.
            rowAcc = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(a, zero),
                                                  _mm_unpacklo_epi8(b, zero)), rowAcc);
        }

        int lanes[4];
        _mm_storeu_si128((__m128i*) lanes, rowAcc);
        // Widen before adding so the horizontal sum cannot overflow int.
        sum += (int64) lanes[0] + lanes[1] + lanes[2] + lanes[3];
    }

    return sum;
}
|
|
/**
 * Dot product (un-normalised cross-correlation) of two unsigned 16-bit
 * vectors.
 *
 * @param vec1 first vector, at least len readable elements
 * @param vec2 second vector, at least len readable elements
 * @param len  number of elements to process; a value <= 0 yields 0
 * @return sum of vec1[i] * vec2[i] for i in [0, len)
 */
int64 CrossCorr_16u_C1( const ushort * vec1, const ushort * vec2, int len )
{
    int64 sum = 0;
    int i = 0;

#ifdef MOSIS_SIMD
    const __m128i zero = _mm_setzero_si128();
    __m128i acc = _mm_setzero_si128();   // two unsigned 64-bit partial sums

    for ( ; i <= len - 8; i += 8)
    {
        const __m128i a = _mm_loadu_si128((const __m128i*) (vec1 + i));
        const __m128i b = _mm_loadu_si128((const __m128i*) (vec2 + i));

        // _mm_madd_epi16 multiplies as SIGNED 16 bit, which is wrong for
        // samples >= 32768. Build the full unsigned 32-bit products from
        // their low and high halves instead.
        const __m128i prodLo = _mm_mullo_epi16(a, b);
        const __m128i prodHi = _mm_mulhi_epu16(a, b);
        const __m128i prod03 = _mm_unpacklo_epi16(prodLo, prodHi);
        const __m128i prod47 = _mm_unpackhi_epi16(prodLo, prodHi);

        // A product can use all 32 bits, so widen to 64 bit before adding.
        acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(prod03, zero));
        acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(prod03, zero));
        acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(prod47, zero));
        acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(prod47, zero));
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*) lanes, acc);
    sum = lanes[0] + lanes[1];
#endif

    // Scalar tail (or the whole vector when MOSIS_SIMD is not defined).
    // Multiply in 64 bit: 65535 * 65535 does not fit in int.
    for ( ; i < len; i++)
    {
        sum += (int64) vec1[i] * vec2[i];
    }

    return sum;
}
|
|
// Sub-sample search for the maximum of a sampled curve.
|
/**
 * Finds the largest sample and refines its position to a fractional index
 * using the two neighbouring samples.
 *
 * Only samples strictly greater than 0 are considered; among equal maxima
 * the first one wins.
 *
 * @param pData array of nData samples
 * @param nData number of samples
 * @return fractional index of the maximum, or -1 when no sample is positive,
 *         when the maximum is the first or last sample, or when the refined
 *         position falls more than one sample away from the maximum
 */
float SearchMaximum(int64 *pData, int nData)
{
    int64 best = 0;
    int iBest = -1;   // stays -1 (rejected below) when no sample is > 0

    for (int i = 0; i < nData; i++)
    {
        if (best < pData[i])
        {
            iBest = i;
            best = pData[i];
        }
    }

    // A neighbour on each side is needed for the refinement.
    if (iBest < 1 || iBest > nData - 2)
        return -1;

    const int64 bestBefore = pData[iBest - 1];
    const int64 bestAfter = pData[iBest + 1];
    float ret = 0;

    if (best == bestAfter)
    {
        // Flat top shared with the next sample: the peak lies halfway.
        ret = iBest + 0.5;
    }
    else if (bestBefore < bestAfter)
    {
        // Right neighbour is higher: the peak lies in [iBest, iBest + 0.5).
        ret = iBest + 0.5 - 0.5 * (best - bestAfter) / (best - bestBefore);
    }
    else
    {
        // Left neighbour is higher or equal: peak lies in (iBest - 0.5, iBest].
        ret = iBest - 0.5 + 0.5 / (best - bestAfter) * (best - bestBefore);
    }

    if (ret > iBest + 1 || ret < iBest - 1)
        return -1;

    return ret;
}
|
// Sub-sample search for the minimum of a sampled curve.
|
/**
 * Finds the smallest sample and refines its position to a fractional index
 * using the two neighbouring samples.
 *
 * Among equal minima the last one wins.
 *
 * @param pData array of nData samples
 * @param nData number of samples
 * @return fractional index of the minimum, or -1 when there are fewer than
 *         three samples, when the minimum is the first or last sample, or
 *         when the refined position falls more than one sample away from
 *         the minimum
 */
float SearchMinimum(int64 *pData, int nData)
{
    // An interior minimum needs a neighbour on each side; this check also
    // keeps pData[0] from being read for an empty array.
    if (nData < 3)
        return -1;

    int64 best = pData[0];
    int iBest = 0;

    for (int i = 0; i < nData; i++)
    {
        if (best >= pData[i])
        {
            iBest = i;
            best = pData[i];
        }
    }

    if (iBest < 1 || iBest > nData - 2)
        return -1;

    const int64 bestBefore = pData[iBest - 1];
    const int64 bestAfter = pData[iBest + 1];
    float ret = 0;

    if (best == bestAfter)
    {
        // Flat bottom shared with the next sample: minimum lies halfway.
        ret = iBest + 0.5;
    }
    else if (best == bestBefore)
    {
        // Flat bottom shared with the previous sample.
        ret = iBest - 0.5;
    }
    else if (bestBefore < bestAfter)
    {
        // Left neighbour is lower: minimum lies in (iBest - 0.5, iBest).
        ret = iBest - 0.5 + 0.5 / (best - bestAfter) * (best - bestBefore);
    }
    else
    {
        // Right neighbour is lower or equal: minimum lies in [iBest, iBest + 0.5).
        ret = iBest + 0.5 - 0.5 * (best - bestAfter) / (best - bestBefore);
    }

    if (ret > iBest + 1 || ret < iBest - 1)
        return -1;

    return ret;
}
|
|
/**
 * Sum of absolute differences between two windows of the same buffer.
 *
 * The windows start at (x1, y1) and (x2, y2) and are width x height pixels.
 * Rows are processed in blocks of 16 bytes; the trailing width % 16 columns
 * of every row are not included.
 *
 * @return accumulated sum of absolute byte differences
 */
AFX_EXT_API INT64 GetAbsoluteDifference(CMosisBuffer buffer, int x1, int y1, int width, int height, int x2, int y2)
{
    const int lastBlock = width - 16;      // start column of the last full block
    __m128i total = _mm_setzero_si128();   // two 64-bit partial sums

    for (int row = 0; row < height; ++row)
    {
        const int rowA = y1 + row;
        const int rowB = y2 + row;

        for (int col = 0; col <= lastBlock; col += 16)
        {
            const __m128i blockA = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x1 + col, rowA));
            const __m128i blockB = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x2 + col, rowB));

            // _mm_sad_epu8 leaves one partial sum in each 64-bit half.
            total = _mm_add_epi64(_mm_sad_epu8(blockA, blockB), total);
        }
    }

    int64 halves[2];
    _mm_storeu_si128((__m128i*) halves, total);

    return halves[0] + halves[1];
}
|
/**
 * Sum of absolute differences between a window of buffer and a window of
 * buffer2.
 *
 * The window in buffer starts at (x1, y1), the one in buffer2 at (x2, y2);
 * both are width x height pixels. Rows are processed in blocks of 16 bytes;
 * the trailing width % 16 columns of every row are not included.
 *
 * @return accumulated sum of absolute byte differences
 */
AFX_EXT_API INT64 GetAbsoluteDifference(CMosisBuffer buffer, int x1, int y1, int width, int height, CMosisBuffer buffer2, int x2, int y2)
{
    const int lastBlock = width - 16;      // start column of the last full block
    __m128i total = _mm_setzero_si128();   // two 64-bit partial sums

    for (int row = 0; row < height; ++row)
    {
        const int rowA = y1 + row;
        const int rowB = y2 + row;

        for (int col = 0; col <= lastBlock; col += 16)
        {
            const __m128i blockA = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x1 + col, rowA));
            const __m128i blockB = _mm_loadu_si128((__m128i*) buffer2.GetDataAddress(x2 + col, rowB));

            // _mm_sad_epu8 leaves one partial sum in each 64-bit half.
            total = _mm_add_epi64(_mm_sad_epu8(blockA, blockB), total);
        }
    }

    int64 halves[2];
    _mm_storeu_si128((__m128i*) halves, total);

    return halves[0] + halves[1];
}
|
|
/**
 * Sub-pixel variant of GetAbsoluteDifference - currently an unfinished stub.
 *
 * The loop loads the reference block and two candidate blocks (at x2 and
 * x2 + 1) and subtracts them, but the differences are never accumulated, so
 * the function always returns 0.
 *
 * NOTE(review): y2 and subpixel are never read, and both windows use the
 * same row index.
 * NOTE(review): width is divided by 16 and the loop still steps by 16
 * columns up to (width / 16) - 16, which looks like a double division -
 * confirm the intended column range before completing this function.
 */
AFX_EXT_API INT64 GetAbsoluteDifference(CMosisBuffer buffer, int x1, int y1, int width, int height, int x2, int y2, float subpixel)
{
    const int lastBlock = (width >> 4) - 16;   // width is divided by 16 first
    const int rowEnd = y1 + height;

    for (int row = y1; row < rowEnd; ++row)
    {
        for (int col = 0; col <= lastBlock; col += 16)
        {
            const __m128i ref = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x1 + col, row));
            const __m128i cand = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x2 + col, row));
            const __m128i candNext = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x2 + col + 1, row));

            // Byte-wise differences; computed but not used yet.
            const __m128i diff = _mm_sub_epi8(ref, cand);
            const __m128i diffNext = _mm_sub_epi8(ref, candNext);
            (void) diff;
            (void) diffNext;
        }
    }

    return 0;
}
|
|
|
|
/* Returns non-zero when any lane of either comparison mask is set. */
static inline int VertConvAnySet(__m128i maskHigh, __m128i maskLow)
{
    return _mm_movemask_epi8(maskHigh) | _mm_movemask_epi8(maskLow);
}

/*
 * Blends 16 pixels of two rows as (near * weightNear + far * weightFar) >> 4
 * and subtracts the already widened source pixels. The upper and lower 8
 * pixels are returned as signed 16-bit lanes in *pDiffHigh / *pDiffLow.
 */
static inline void VertConvBlendDiff(const unsigned char *pNear, const unsigned char *pFar,
                                     __m128i weightNear, __m128i weightFar,
                                     __m128i srcHigh, __m128i srcLow,
                                     __m128i *pDiffHigh, __m128i *pDiffLow)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i nearRow = _mm_loadu_si128((const __m128i*) pNear);
    const __m128i farRow = _mm_loadu_si128((const __m128i*) pFar);

    // Widen each half to 16 bit, weight, add, then drop the 4 fraction bits.
    const __m128i high = _mm_adds_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(nearRow, zero), weightNear),
                                        _mm_mullo_epi16(_mm_unpackhi_epi8(farRow, zero), weightFar));
    const __m128i low = _mm_adds_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(nearRow, zero), weightNear),
                                       _mm_mullo_epi16(_mm_unpacklo_epi8(farRow, zero), weightFar));

    *pDiffHigh = _mm_subs_epi16(_mm_srli_epi16(high, 4), srcHigh);
    *pDiffLow = _mm_subs_epi16(_mm_srli_epi16(low, 4), srcLow);
}

/**
 * Vertical difference filter over an 8-bit image, 16 pixels at a time.
 *
 * For every processed row j and 16-pixel block, the pixel "pitch" rows below
 * (linearly interpolated between rows j+nPitch and j+nPitch+1 in 1/16 steps)
 * minus the pixel at row j is computed. If any lane of that forward
 * difference exceeds Threshold, the same difference is computed towards the
 * rows above (j-nPitch, j-nPitch-1). The block's forward difference,
 * saturated to 0..255, is stored when both directions exceed +Threshold in
 * at least one lane, or both fall below -Threshold in at least one lane;
 * otherwise the block is stored as zeros.
 *
 * Rows nPitch+1 .. nImageHeight-nPitch-2 are written; other rows of pDest
 * are left untouched. Nothing is written when the image is too small for
 * the pitch or when the integer pitch is negative.
 *
 * @param pSrc         source image, nImageWidth * nImageHeight bytes
 * @param pDest        destination image of the same size
 * @param nImageWidth  row length in bytes; every column block loads and
 *                     stores 16 bytes, so a width that is not a multiple of
 *                     16 makes the last block run past the end of its row
 * @param nImageHeight number of rows
 * @param pitch        vertical distance in rows; the fraction is quantised
 *                     to 1/16
 * @param Threshold    magnitude a difference has to exceed
 */
void VertConvSIMD(unsigned char *pSrc, unsigned char *pDest, unsigned int nImageWidth, unsigned int nImageHeight, float pitch, int Threshold)
{
    const int nPitch = (int) pitch;
    const int SubPixel = (int) ((pitch - nPitch) * 16);

    // A negative integer pitch would address rows outside the image.
    if (nPitch < 0)
        return;

    // Signed on purpose: as an unsigned expression this wraps to a huge
    // value when the image has fewer than nPitch + 1 rows, and the row loop
    // would then run far out of bounds.
    const int rowEnd = (int) nImageHeight - nPitch - 1;

    const __m128i zero = _mm_setzero_si128();
    const __m128i weightNear = _mm_set1_epi16(16 - SubPixel);
    const __m128i weightFar = _mm_set1_epi16(SubPixel);
    const __m128i thresPos = _mm_set1_epi16(Threshold);
    const __m128i thresNeg = _mm_set1_epi16(Threshold * -1);

    for (unsigned int i = 0; i < nImageWidth; i += 16)
    {
        for (int j = nPitch + 1; j < rowEnd; j++)
        {
            const __m128i src = _mm_loadu_si128((const __m128i*) (pSrc + i + j * nImageWidth));
            const __m128i srcHigh = _mm_unpackhi_epi8(src, zero);   // upper 8 pixels as 16 bit
            const __m128i srcLow = _mm_unpacklo_epi8(src, zero);    // lower 8 pixels as 16 bit

            // Difference towards the rows below.
            __m128i fwdHigh, fwdLow;
            VertConvBlendDiff(pSrc + i + (j + nPitch) * nImageWidth,
                              pSrc + i + (j + nPitch + 1) * nImageWidth,
                              weightNear, weightFar, srcHigh, srcLow, &fwdHigh, &fwdLow);

            const int fwdAbove = VertConvAnySet(_mm_cmpgt_epi16(fwdHigh, thresPos),
                                                _mm_cmpgt_epi16(fwdLow, thresPos));
            const int fwdBelow = VertConvAnySet(_mm_cmplt_epi16(fwdHigh, thresNeg),
                                                _mm_cmplt_epi16(fwdLow, thresNeg));

            int keep = 0;
            if (fwdAbove || fwdBelow)
            {
                // Difference towards the rows above; only needed when the
                // forward direction already exceeded the threshold.
                __m128i bwdHigh, bwdLow;
                VertConvBlendDiff(pSrc + i + (j - nPitch) * nImageWidth,
                                  pSrc + i + (j - nPitch - 1) * nImageWidth,
                                  weightNear, weightFar, srcHigh, srcLow, &bwdHigh, &bwdLow);

                // Both directions have to agree on the sign of the excess.
                keep = (fwdAbove && VertConvAnySet(_mm_cmpgt_epi16(bwdHigh, thresPos),
                                                   _mm_cmpgt_epi16(bwdLow, thresPos)))
                    || (fwdBelow && VertConvAnySet(_mm_cmplt_epi16(bwdHigh, thresNeg),
                                                   _mm_cmplt_epi16(bwdLow, thresNeg)));
            }

            // Packing saturates to 0..255, so negative differences become 0.
            _mm_storeu_si128((__m128i*) (pDest + i + j * nImageWidth),
                             keep ? _mm_packus_epi16(fwdLow, fwdHigh) : zero);
        }
    }
}
|