// SISMath.cpp: implementation of the CSISMath class.
|
//
|
//////////////////////////////////////////////////////////////////////
|
|
#include "stdafx.h"
|
#include "SISMath.h"
|
|
|
#include <emmintrin.h>
|
#include <tmmintrin.h>
|
#include <omp.h>
|
|
#ifdef _DEBUG
|
#define new DEBUG_NEW
|
#undef THIS_FILE
|
static char THIS_FILE[] = __FILE__;
|
#endif
|
|
//////////////////////////////////////////////////////////////////////
|
// Construction/Destruction
|
//////////////////////////////////////////////////////////////////////
|
|
#define SIS_SIMD // Switches the plain C++ version over to the intrinsic version. The C++ version can probably be deleted.
|
|
//#define SIS_ASM // Switches the intrinsic version over to the assembly version. Tests after implementing it showed no timing difference, so the intrinsic version is used.
|
|
//#define SIS_DEBUG_OMP // Applies OpenMP to the intrinsic version
|
// OpenMP could not be adopted: after implementing it, tests ran slower.
|
// It was applied at the SIMD level, but in most cases it took about twice as long.
|
// The cause is judged to be thread overhead and context switching.
|
// It works in Debug builds but does not behave correctly in Release builds.
|
|
|
|
#if defined(SIS_ASM)
|
extern "C"{
|
int64 _asm_Get_CCU8(const uchar * vec1, const uchar * vec2, int len, int flag);// 0= aa, 1=au, 2=uu
|
};
|
#endif
|
|
//#define SIS_DEBUG_ALIGNED_UNALIGNED
|
|
// Sums every sample of `buffer`, treating the data as 8-bit values.
//
// Each row is processed in groups of 16 bytes: `_mm_sad_epu8` against zero
// yields two 64-bit partial sums per load, which are accumulated with
// `_mm_add_epi64`. The remaining `width % 16` samples of the row are added one
// at a time through GetPixel().
//
// buffer : image to sum; rows are read through GetDataAddress(x, y).
// return : sum of all samples inside GetWidth() x GetHeight().
INT64 CSISMath::GetSum(CSISBuffer buffer)
{
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();
    int64 tailSum = 0;

    const int width = buffer.GetWidth();
    const int height = buffer.GetHeight();

    for (int y = 0; y < height; y++)
    {
        int x = 0;

        // Take a 16-byte load only while 16 samples really remain in the row.
        // (The former bound `x < width` loaded past the end of the row and
        // left the scalar loop below with nothing to do.)
        for (; x <= width - 16; x += 16)
        {
            const __m128i mmData = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x, y));
            mmAcc = _mm_add_epi64(_mm_sad_epu8(mmData, mmZero), mmAcc);
        }

        // Remainder of the row that does not fill a 16-byte group.
        for (; x < width; x++)
        {
            tailSum += buffer.GetPixel(x, y);
        }
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    return lanes[0] + lanes[1] + tailSum;
}
|
|
// Returns the mean sample value of `buffer`, i.e. GetSum(buffer) divided by
// the number of samples (GetWidth() * GetHeight()).
//
// The sample count is formed in double precision so that large images cannot
// overflow an int multiplication. An empty buffer (non-positive width or
// height) yields 0.0 instead of dividing by zero.
double CSISMath::GetAverage(CSISBuffer buffer)
{
    const int width = buffer.GetWidth();
    const int height = buffer.GetHeight();
    if (width <= 0 || height <= 0)
        return 0.0;

    const double sampleCount = (double)width * (double)height;
    return (double)(GetSum(buffer)) / sampleCount;
}
|
// Sum of absolute differences between two byte arrays.
//
// pData, pData2 : arrays of at least `len` bytes each (no alignment needed,
//                 unaligned loads are used).
// len           : number of bytes to compare.
// return        : sum over i in [0, len) of |pData[i] - pData2[i]|.
//
// Fixes relative to the previous version:
//  * the load address was `(__m128i*) pData + x`, which advances by 16*x
//    bytes because the cast binds tighter than `+`; it is now
//    `(__m128i*)(pData + x)`;
//  * the tail loop added `pData[len]` (one past the end) `len` times instead
//    of the differences of the leftover `len % 16` bytes.
INT64 CSISMath::GetAbsDiff(BYTE* pData, BYTE* pData2, int len)
{
    __m128i mmAcc = _mm_setzero_si128();

    int x = 0;
    for (; x <= len - 16; x += 16)
    {
        const __m128i mmA = _mm_loadu_si128((const __m128i*)(pData + x));
        const __m128i mmB = _mm_loadu_si128((const __m128i*)(pData2 + x));
        // _mm_sad_epu8 produces two 64-bit partial sums of |a - b|.
        mmAcc = _mm_add_epi64(_mm_sad_epu8(mmA, mmB), mmAcc);
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);

    // Leftover bytes that do not fill a 16-byte group.
    int64 tailSum = 0;
    for (; x < len; x++)
    {
        const int diff = (int)pData[x] - (int)pData2[x];
        tailSum += (diff < 0) ? -diff : diff;
    }

    return lanes[0] + lanes[1] + tailSum;
}
|
// Sum of absolute differences between a width x height window of `buffer`
// anchored at (x1, y1) and the same-sized window of `buffer2` anchored at
// (x2, y2). Data is treated as 8-bit samples.
//
// Rows are fetched with GetDataAddress(); groups of 16 bytes are compared with
// `_mm_sad_epu8` and accumulated in two 64-bit lanes. The leftover
// `width % 16` columns of each row are compared byte by byte, so the whole
// window contributes to the result.
//
// NOTE(review): the previous version had the remainder loop commented out and
// therefore ignored the last `width % 16` columns; confirm that no caller
// relies on that truncated result.
INT64 CSISMath::GetAbsDiff(CSISBuffer buffer, int x1, int y1, int width, int height, CSISBuffer buffer2, int x2, int y2)
{
    __m128i mmAcc = _mm_setzero_si128();
    int64 tailSum = 0;

    for (int y = 0; y < height; y++)
    {
        int x = 0;
        for (; x <= width - 16; x += 16)
        {
            const __m128i mmA = _mm_loadu_si128((__m128i*) buffer.GetDataAddress(x1 + x, y1 + y));
            const __m128i mmB = _mm_loadu_si128((__m128i*) buffer2.GetDataAddress(x2 + x, y2 + y));
            mmAcc = _mm_add_epi64(_mm_sad_epu8(mmA, mmB), mmAcc);
        }

        // Remainder of the row: read the same row storage byte by byte.
        if (x < width)
        {
            const unsigned char* rowA = (const unsigned char*) buffer.GetDataAddress(x1 + x, y1 + y);
            const unsigned char* rowB = (const unsigned char*) buffer2.GetDataAddress(x2 + x, y2 + y);
            const int remaining = width - x;
            for (int k = 0; k < remaining; k++)
            {
                const int diff = (int)rowA[k] - (int)rowB[k];
                tailSum += (diff < 0) ? -diff : diff;
            }
        }
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    return lanes[0] + lanes[1] + tailSum;
}
|
|
|
// Positive curve: a maximum exists.
|
// Finds the largest sample of pData[0..nData) and refines its position to a
// fractional index using the two samples next to it.
//
// pData  : array of nData samples.
// nData  : number of samples.
// return : fractional index of the maximum, or -1 when
//            * pData is NULL or nData < 2,
//            * the largest sample is the first or last element (a neighbour
//              is missing), or
//            * the refined position ends up more than one sample away from
//              the integer peak.
double CSISMath::SearchMaximum(int64 *pData, int nData)
{
    if (pData == NULL || nData < 2)
        return -1;

    // Strict '<' keeps the first occurrence when the largest value repeats.
    int peakIdx = 0;
    int64 peak = pData[0];
    for (int k = 1; k < nData; ++k)
    {
        const int64 candidate = pData[k];
        if (peak < candidate)
        {
            peak = candidate;
            peakIdx = k;
        }
    }

    // Both neighbours are required for the refinement.
    if (peakIdx < 1 || peakIdx > nData - 2)
        return -1;

    const int64 before = pData[peakIdx - 1];
    const int64 after = pData[peakIdx + 1];
    const int64 dropAfter = peak - after;
    const int64 dropBefore = peak - before;

    double pos = 0;
    if (dropAfter == 0)
    {
        // Peak and right neighbour are equal: place the maximum between them.
        pos = peakIdx + 0.5;
    }
    else if (dropBefore == 0)
    {
        // Peak and left neighbour are equal: place the maximum between them.
        pos = peakIdx - 0.5;
    }
    else if (before < after)
    {
        // Right neighbour is higher: shift towards it.
        pos = peakIdx + 0.5 - 0.5 * dropAfter / dropBefore;
    }
    else
    {
        // Left neighbour is higher (or equal): shift towards it.
        pos = peakIdx - 0.5 + 0.5 / dropAfter * dropBefore;
    }

    if (pos > peakIdx + 1 || pos < peakIdx - 1)
        return -1;

    return pos;
}
|
// Negative curve: a minimum exists.
|
// Finds the smallest sample of pData[0..nData) and refines its position to a
// fractional index using the two samples next to it.
//
// pData  : array of nData samples.
// nData  : number of samples.
// return : fractional index of the minimum, or -1 when
//            * pData is NULL or nData < 2,
//            * the smallest sample is the first or last element (a neighbour
//              is missing), or
//            * the refined position ends up more than one sample away from
//              the integer minimum.
double CSISMath::SearchMinimum(int64 *pData, int nData)
{
    if (pData == NULL || nData < 2)
        return -1;

    // Strict '>' keeps the first occurrence when the smallest value repeats.
    int valleyIdx = 0;
    int64 valley = pData[0];
    for (int k = 1; k < nData; ++k)
    {
        const int64 candidate = pData[k];
        if (valley > candidate)
        {
            valley = candidate;
            valleyIdx = k;
        }
    }

    // Both neighbours are required for the refinement.
    if (valleyIdx < 1 || valleyIdx > nData - 2)
        return -1;

    const int64 before = pData[valleyIdx - 1];
    const int64 after = pData[valleyIdx + 1];
    const int64 gapAfter = valley - after;
    const int64 gapBefore = valley - before;

    double pos = 0;
    if (gapAfter == 0)
    {
        // Minimum and right neighbour are equal: place the result between them.
        pos = valleyIdx + 0.5;
    }
    else if (gapBefore == 0)
    {
        // Minimum and left neighbour are equal: place the result between them.
        pos = valleyIdx - 0.5;
    }
    else if (before < after)
    {
        // Left neighbour is lower: shift towards it.
        pos = valleyIdx - 0.5 + 0.5 / gapAfter * gapBefore;
    }
    else
    {
        // Right neighbour is lower (or equal): shift towards it.
        pos = valleyIdx + 0.5 - 0.5 * gapAfter / gapBefore;
    }

    if (pos > valleyIdx + 1 || pos < valleyIdx - 1)
        return -1;

    return pos;
}
|
#define SIS_SIMD_INT64
|
|
// Sums `len` unsigned 8-bit values.
//
// vec    : array of at least `len` bytes (no alignment required).
// len    : number of elements to sum.
// return : sum of vec[0..len).
//
// With SIS_SIMD defined, groups of 16 bytes are reduced with `_mm_sad_epu8`
// against zero and accumulated in two 64-bit lanes, the same scheme GetSum()
// uses. This replaces the former 32-bit signed lane accumulators, which could
// wrap on very long inputs. The leftover elements (or all of them when
// SIS_SIMD is not defined) are added by the scalar loop.
int64 CSISMath::SumPixels_8u( const uchar * vec, int len )
{
    int i = 0;
    int64 sum = 0;

#if defined(SIS_SIMD)
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 16; i += 16)
    {
        const __m128i mmData = _mm_loadu_si128((const __m128i*)(vec + i));
        mmAcc = _mm_add_epi64(mmAcc, _mm_sad_epu8(mmData, mmZero));
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#endif

    for (; i < len; i++)
    {
        sum += vec[i];
    }

    return sum;
}
|
// Sums `len` unsigned 16-bit values.
//
// vec    : array of at least `len` elements (no alignment required).
// len    : number of elements to sum.
// return : sum of vec[0..len).
//
// With SIS_SIMD defined, each group of 8 elements is zero-extended to 32 bits,
// pair-added, zero-extended again to 64 bits and accumulated in two 64-bit
// lanes. The previous version used `_mm_madd_epi16(x, 1)`, which interprets
// the elements as *signed* 16-bit (values >= 32768 were counted as negative)
// and accumulated in 32-bit lanes that could overflow.
int64 CSISMath::SumPixels_16u( const ushort * vec, int len )
{
    int i = 0;
    int64 sum = 0;

#if defined(SIS_SIMD)
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 8; i += 8)
    {
        const __m128i mmData = _mm_loadu_si128((const __m128i*)(vec + i));

        // 8 x u16 -> two vectors of 4 x u32, added lane-wise (max 2 * 65535).
        const __m128i mmPairs = _mm_add_epi32(_mm_unpacklo_epi16(mmData, mmZero),
                                              _mm_unpackhi_epi16(mmData, mmZero));

        // 4 x u32 -> 2 x (2 x u64) and accumulate.
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmPairs, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmPairs, mmZero));
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined).
    for (; i < len; i++)
    {
        sum += vec[i];
    }

    return sum;
}
|
|
// Sum of squares of `len` unsigned 8-bit values, aligned variant.
//
// vec1   : array of at least `len` bytes; must be 16-byte aligned because the
//          SIMD path uses `_mm_load_si128`.
// len    : number of elements.
// return : sum over i in [0, len) of vec1[i] * vec1[i].
//
// Build switches (all kept from the original):
//   SIS_DEBUG_OMP               - delegate to OMP_CCU8_A().
//   SIS_ASM                     - delegate to _asm_Get_CCU8(..., 0).
//   SIS_DEBUG_ALIGNED_UNALIGNED - delegate to the unaligned two-vector version.
//   SIS_SIMD                    - use SSE2; otherwise a plain scalar loop runs.
//   SIS_SIMD_INT64              - accumulate in two 64-bit lanes; otherwise in
//                                 four 32-bit lanes (which can still overflow
//                                 on long, bright inputs).
//
// Fix: in the 32-bit fallback the four lanes used to be added in UINT32
// arithmetic; they are now widened to int64 before the addition, as the
// sibling functions already do.
int64 CSISMath::Get_CCU8_A(const uchar * vec1, int len )
{
#if defined(SIS_DEBUG_OMP)
    int64 rslt= OMP_CCU8_A(vec1, len);
    return rslt;
#endif

#if defined(SIS_ASM)
    return _asm_Get_CCU8(vec1, NULL, len, 0);
#else
#if defined(SIS_DEBUG_ALIGNED_UNALIGNED)
    return Get_CCU8_UA(vec1, vec1, len);
#endif

    int i = 0;
    int64 sum = 0;

#ifdef SIS_SIMD
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 16; i += 16)
    {
        const __m128i mmData = _mm_load_si128((const __m128i*)(vec1 + i));

        // Zero-extend the upper and lower 8 bytes to 16-bit lanes.
        const __m128i mmHi = _mm_unpackhi_epi8(mmData, mmZero);
        const __m128i mmLo = _mm_unpacklo_epi8(mmData, mmZero);

        // madd: square each lane and add adjacent pairs -> 4 x 32-bit.
        // Each lane is at most 2 * 255 * 255, so adding both halves is safe.
        const __m128i mmSq = _mm_add_epi32(_mm_madd_epi16(mmHi, mmHi),
                                           _mm_madd_epi16(mmLo, mmLo));
#ifdef SIS_SIMD_INT64
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmSq, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmSq, mmZero));
#else
        mmAcc = _mm_add_epi32(mmAcc, mmSq);
#endif
    }

#ifdef SIS_SIMD_INT64
    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#else
    UINT32 lanes[4];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = (int64)lanes[0] + lanes[1] + lanes[2] + lanes[3];
#endif
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined).
    for (; i < len; i++)
    {
        sum += vec1[i] * vec1[i];
    }

    return sum;
#endif
}
|
|
|
// Dot product of two unsigned 8-bit arrays, aligned/unaligned variant.
//
// vec1   : array of at least `len` bytes; must be 16-byte aligned
//          (`_mm_load_si128`).
// vec2   : array of at least `len` bytes; may be unaligned
//          (`_mm_loadu_si128`).
// len    : number of elements.
// return : sum over i in [0, len) of vec1[i] * vec2[i].
//
// Build switches (all kept from the original):
//   SIS_DEBUG_OMP               - delegate to OMP_CCU8_AUA().
//   SIS_ASM                     - delegate to _asm_Get_CCU8(..., 1).
//   SIS_DEBUG_ALIGNED_UNALIGNED - delegate to the fully unaligned version.
//   SIS_SIMD                    - use SSE2; otherwise a plain scalar loop runs.
//   SIS_SIMD_INT64              - accumulate in two 64-bit lanes; otherwise in
//                                 four 32-bit lanes (which can still overflow
//                                 on long, bright inputs).
//
// Fix: in the 32-bit fallback the four lanes used to be added in UINT32
// arithmetic; they are now widened to int64 before the addition.
int64 CSISMath::Get_CCU8_AUA(const uchar * vec1, const uchar * vec2, int len )
{
#if defined(SIS_DEBUG_OMP)
    int64 rslt= OMP_CCU8_AUA(vec1, vec2, len);
    return rslt;
#endif

#if defined(SIS_ASM)
    return _asm_Get_CCU8(vec1, vec2, len, 1);
#else
#if defined(SIS_DEBUG_ALIGNED_UNALIGNED)
    return Get_CCU8_UA(vec1, vec2, len);
#endif

    int i = 0;
    int64 sum = 0;

#ifdef SIS_SIMD
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 16; i += 16)
    {
        const __m128i mmA = _mm_load_si128((const __m128i*)(vec1 + i));
        const __m128i mmB = _mm_loadu_si128((const __m128i*)(vec2 + i));

        // Multiply the zero-extended upper and lower 8 bytes; madd adds
        // adjacent products into 4 x 32-bit lanes (each <= 2 * 255 * 255).
        const __m128i mmProdHi = _mm_madd_epi16(_mm_unpackhi_epi8(mmA, mmZero),
                                                _mm_unpackhi_epi8(mmB, mmZero));
        const __m128i mmProdLo = _mm_madd_epi16(_mm_unpacklo_epi8(mmA, mmZero),
                                                _mm_unpacklo_epi8(mmB, mmZero));
        const __m128i mmProd = _mm_add_epi32(mmProdHi, mmProdLo);
#ifdef SIS_SIMD_INT64
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmProd, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmProd, mmZero));
#else
        mmAcc = _mm_add_epi32(mmAcc, mmProd);
#endif
    }

#ifdef SIS_SIMD_INT64
    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#else
    UINT32 lanes[4];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = (int64)lanes[0] + lanes[1] + lanes[2] + lanes[3];
#endif
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined).
    for (; i < len; i++)
    {
        sum += vec1[i] * vec2[i];
    }

    return sum;
#endif
}
|
|
// Dot product of two unsigned 8-bit arrays, fully unaligned variant.
//
// vec1, vec2 : arrays of at least `len` bytes each; no alignment required
//              (`_mm_loadu_si128`).
// len        : number of elements.
// return     : sum over i in [0, len) of vec1[i] * vec2[i].
//
// Build switches:
//   SIS_ASM        - delegate to _asm_Get_CCU8(..., 2).
//   SIS_SIMD       - use SSE2; otherwise a plain scalar loop runs.
//   SIS_SIMD_INT64 - accumulate in two 64-bit lanes; otherwise in four 32-bit
//                    lanes (which can overflow on long, bright inputs).
//
// Fix: the SIS_SIMD_INT64 section of this function was empty, so it always
// accumulated in 32-bit lanes and could overflow where the sibling functions
// do not. It now has the same 64-bit accumulation as they do.
int64 CSISMath::Get_CCU8_UA(const uchar * vec1, const uchar * vec2, int len )
{
#if defined(SIS_ASM)
    return _asm_Get_CCU8(vec1, vec2, len, 2);
#else
    int i = 0;
    int64 sum = 0;

#ifdef SIS_SIMD
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 16; i += 16)
    {
        const __m128i mmA = _mm_loadu_si128((const __m128i*)(vec1 + i));
        const __m128i mmB = _mm_loadu_si128((const __m128i*)(vec2 + i));

        // Multiply the zero-extended upper and lower 8 bytes; madd adds
        // adjacent products into 4 x 32-bit lanes (each <= 2 * 255 * 255).
        const __m128i mmProdHi = _mm_madd_epi16(_mm_unpackhi_epi8(mmA, mmZero),
                                                _mm_unpackhi_epi8(mmB, mmZero));
        const __m128i mmProdLo = _mm_madd_epi16(_mm_unpacklo_epi8(mmA, mmZero),
                                                _mm_unpacklo_epi8(mmB, mmZero));
        const __m128i mmProd = _mm_add_epi32(mmProdHi, mmProdLo);
#ifdef SIS_SIMD_INT64
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmProd, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmProd, mmZero));
#else
        mmAcc = _mm_add_epi32(mmAcc, mmProd);
#endif
    }

#ifdef SIS_SIMD_INT64
    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#else
    UINT32 lanes[4];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = (int64)lanes[0] + lanes[1] + lanes[2] + lanes[3];
#endif
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined).
    for (; i < len; i++)
    {
        sum += vec1[i] * vec2[i];
    }

    return sum;
#endif
}
|
|
// Sum of squares of `len` unsigned 8-bit values, unaligned variant.
//
// vec1   : array of at least `len` bytes; no alignment required
//          (`_mm_loadu_si128`).
// len    : number of elements.
// return : sum over i in [0, len) of vec1[i] * vec1[i].
//
// Build switches (all kept from the original):
//   SIS_DEBUG_OMP  - delegate to OMP_CCU8_UA().
//   SIS_ASM        - delegate to _asm_Get_CCU8(..., 2).
//   SIS_SIMD       - use SSE2; otherwise a plain scalar loop runs.
//   SIS_SIMD_INT64 - accumulate in two 64-bit lanes; otherwise in four 32-bit
//                    lanes (which can still overflow on long, bright inputs).
//
// Fix: in the 32-bit fallback the four lanes used to be added in UINT32
// arithmetic; they are now widened to int64 before the addition.
int64 CSISMath::Get_CCU8_UA(const uchar * vec1, int len )
{
#if defined(SIS_DEBUG_OMP)
    int64 rslt= OMP_CCU8_UA(vec1, len);
    return rslt;
#endif

#if defined(SIS_ASM)
    return _asm_Get_CCU8(vec1, NULL, len, 2);
#else
    int i = 0;
    int64 sum = 0;

#ifdef SIS_SIMD
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 16; i += 16)
    {
        const __m128i mmData = _mm_loadu_si128((const __m128i*)(vec1 + i));

        // Zero-extend the upper and lower 8 bytes to 16-bit lanes.
        const __m128i mmHi = _mm_unpackhi_epi8(mmData, mmZero);
        const __m128i mmLo = _mm_unpacklo_epi8(mmData, mmZero);

        // madd: square each lane and add adjacent pairs -> 4 x 32-bit.
        // Each lane is at most 2 * 255 * 255, so adding both halves is safe.
        const __m128i mmSq = _mm_add_epi32(_mm_madd_epi16(mmHi, mmHi),
                                           _mm_madd_epi16(mmLo, mmLo));
#ifdef SIS_SIMD_INT64
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmSq, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmSq, mmZero));
#else
        mmAcc = _mm_add_epi32(mmAcc, mmSq);
#endif
    }

#ifdef SIS_SIMD_INT64
    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#else
    UINT32 lanes[4];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = (int64)lanes[0] + lanes[1] + lanes[2] + lanes[3];
#endif
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined).
    for (; i < len; i++)
    {
        sum += vec1[i] * vec1[i];
    }

    return sum;
#endif
}
|
|
|
// Sum of element-wise products between a width x height window of `buffer1`
// anchored at (x1, y1) and the same-sized window of `buffer2` anchored at
// (x2, y2). Data is treated as unsigned 8-bit samples.
//
// Rows are fetched with GetDataAddress(). Groups of 16 bytes are multiplied
// with `_mm_madd_epi16` on zero-extended lanes and accumulated in two 64-bit
// lanes; the leftover `width % 16` columns of each row are multiplied byte by
// byte.
//
// Fixes relative to the previous version:
//  * the inner bound was `w < width - 16`, which skipped the last full
//    16-pixel group whenever width is a multiple of 16 (width == 16 gave 0);
//  * accumulation used 32-bit lanes, which can overflow on large windows;
//  * the remainder columns were ignored (loop commented out).
// NOTE(review): callers tuned against the old, truncated sums will see
// different (complete) values - confirm this is acceptable.
int64 CSISMath::Get_CCU8( CSISBuffer buffer1, int x1, int y1, int width, int height, CSISBuffer buffer2, int x2, int y2)
{
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();
    int64 tailSum = 0;

    for (int h = 0; h < height; h++)
    {
        int w = 0;
        for (; w <= width - 16; w += 16)
        {
            const __m128i mmA = _mm_loadu_si128((__m128i*) buffer1.GetDataAddress(x1 + w, y1 + h));
            const __m128i mmB = _mm_loadu_si128((__m128i*) buffer2.GetDataAddress(x2 + w, y2 + h));

            // Products of the zero-extended upper and lower 8 bytes; each
            // 32-bit lane is at most 2 * 255 * 255, so the halves can be added.
            const __m128i mmProdHi = _mm_madd_epi16(_mm_unpackhi_epi8(mmA, mmZero),
                                                    _mm_unpackhi_epi8(mmB, mmZero));
            const __m128i mmProdLo = _mm_madd_epi16(_mm_unpacklo_epi8(mmA, mmZero),
                                                    _mm_unpacklo_epi8(mmB, mmZero));
            const __m128i mmProd = _mm_add_epi32(mmProdHi, mmProdLo);

            // Widen to 64 bits before accumulating.
            mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmProd, mmZero));
            mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmProd, mmZero));
        }

        // Remainder of the row: read the same row storage byte by byte.
        if (w < width)
        {
            const unsigned char* rowA = (const unsigned char*) buffer1.GetDataAddress(x1 + w, y1 + h);
            const unsigned char* rowB = (const unsigned char*) buffer2.GetDataAddress(x2 + w, y2 + h);
            const int remaining = width - w;
            for (int k = 0; k < remaining; k++)
            {
                tailSum += rowA[k] * rowB[k];
            }
        }
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    return lanes[0] + lanes[1] + tailSum;
}
|
|
// Dot product of two unsigned 16-bit arrays, aligned variant.
//
// vec1, vec2 : arrays of at least `len` elements each; both must be 16-byte
//              aligned (`_mm_load_si128`).
// len        : number of elements.
// return     : sum over i in [0, len) of vec1[i] * vec2[i].
//
// With SIS_DEBUG_ALIGNED_UNALIGNED defined the call is forwarded to
// Get_CCU16_UA(). With SIS_SIMD defined, groups of 8 elements are multiplied
// as full unsigned 16x16 -> 32-bit products (`_mm_mullo_epi16` +
// `_mm_mulhi_epu16`) and accumulated in two 64-bit lanes.
//
// Fixes relative to the previous version:
//  * `_mm_madd_epi16` multiplies *signed* 16-bit lanes, so any element
//    >= 32768 produced a wrong product;
//  * the 32-bit lane accumulators overflowed after a few hundred bright
//    elements;
//  * the scalar loops multiplied two ushort values in `int`, which overflows
//    for large values.
int64 CSISMath::Get_CCU16_A( const ushort * vec1, const ushort * vec2, int len )
{
#if defined(SIS_DEBUG_ALIGNED_UNALIGNED)
    return Get_CCU16_UA(vec1, vec2, len);
#endif

    int i = 0;
    int64 sum = 0;

#ifdef SIS_SIMD
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 8; i += 8)
    {
        const __m128i mmA = _mm_load_si128((const __m128i*)(vec1 + i));
        const __m128i mmB = _mm_load_si128((const __m128i*)(vec2 + i));

        // Low and high 16 bits of each unsigned product, interleaved into
        // eight exact 32-bit products.
        const __m128i mmLow = _mm_mullo_epi16(mmA, mmB);
        const __m128i mmHigh = _mm_mulhi_epu16(mmA, mmB);
        const __m128i mmProd0 = _mm_unpacklo_epi16(mmLow, mmHigh);
        const __m128i mmProd1 = _mm_unpackhi_epi16(mmLow, mmHigh);

        // Zero-extend to 64 bits and accumulate.
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmProd0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmProd0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmProd1, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmProd1, mmZero));
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined); the
    // product is formed in 64 bits so it cannot overflow.
    for (; i < len; i++)
    {
        sum += (int64)vec1[i] * vec2[i];
    }

    return sum;
}
|
|
// Sum of squares of `len` unsigned 16-bit values, aligned variant.
//
// vec1   : array of at least `len` elements; must be 16-byte aligned
//          (`_mm_load_si128`).
// len    : number of elements.
// return : sum over i in [0, len) of vec1[i] * vec1[i].
//
// With SIS_DEBUG_ALIGNED_UNALIGNED defined the call is forwarded to
// Get_CCU16_UA(vec1, vec1, len). With SIS_SIMD defined, groups of 8 elements
// are squared as full unsigned 16x16 -> 32-bit products (`_mm_mullo_epi16` +
// `_mm_mulhi_epu16`) and accumulated in two 64-bit lanes.
//
// Fixes relative to the previous version:
//  * `_mm_madd_epi16` multiplies *signed* 16-bit lanes (wrong for elements
//    >= 32768, and two squares of 32768 overflow its 32-bit result);
//  * the 32-bit lane accumulators overflowed after a few hundred bright
//    elements;
//  * the scalar loops squared a ushort in `int`, which overflows for large
//    values.
int64 CSISMath::Get_CCU16_A( const ushort * vec1, int len )
{
#if defined(SIS_DEBUG_ALIGNED_UNALIGNED)
    return Get_CCU16_UA(vec1, vec1, len);
#endif

    int i = 0;
    int64 sum = 0;

#ifdef SIS_SIMD
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 8; i += 8)
    {
        const __m128i mmData = _mm_load_si128((const __m128i*)(vec1 + i));

        // Low and high 16 bits of each unsigned square, interleaved into
        // eight exact 32-bit values.
        const __m128i mmLow = _mm_mullo_epi16(mmData, mmData);
        const __m128i mmHigh = _mm_mulhi_epu16(mmData, mmData);
        const __m128i mmSq0 = _mm_unpacklo_epi16(mmLow, mmHigh);
        const __m128i mmSq1 = _mm_unpackhi_epi16(mmLow, mmHigh);

        // Zero-extend to 64 bits and accumulate.
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmSq0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmSq0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmSq1, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmSq1, mmZero));
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined); the
    // square is formed in 64 bits so it cannot overflow.
    for (; i < len; i++)
    {
        sum += (int64)vec1[i] * vec1[i];
    }

    return sum;
}
|
|
// Dot product of two unsigned 16-bit arrays, fully unaligned variant.
//
// vec1, vec2 : arrays of at least `len` elements each; no alignment required
//              (`_mm_loadu_si128`).
// len        : number of elements.
// return     : sum over i in [0, len) of vec1[i] * vec2[i].
//
// With SIS_SIMD defined, groups of 8 elements are multiplied as full unsigned
// 16x16 -> 32-bit products (`_mm_mullo_epi16` + `_mm_mulhi_epu16`) and
// accumulated in two 64-bit lanes.
//
// Fixes relative to the previous version:
//  * `_mm_madd_epi16` multiplies *signed* 16-bit lanes, so any element
//    >= 32768 produced a wrong product;
//  * the 32-bit lane accumulators overflowed after a few hundred bright
//    elements;
//  * the scalar loops multiplied two ushort values in `int`, which overflows
//    for large values.
int64 CSISMath::Get_CCU16_UA( const ushort * vec1, const ushort * vec2, int len )
{
    int i = 0;
    int64 sum = 0;

#ifdef SIS_SIMD
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 8; i += 8)
    {
        const __m128i mmA = _mm_loadu_si128((const __m128i*)(vec1 + i));
        const __m128i mmB = _mm_loadu_si128((const __m128i*)(vec2 + i));

        // Low and high 16 bits of each unsigned product, interleaved into
        // eight exact 32-bit products.
        const __m128i mmLow = _mm_mullo_epi16(mmA, mmB);
        const __m128i mmHigh = _mm_mulhi_epu16(mmA, mmB);
        const __m128i mmProd0 = _mm_unpacklo_epi16(mmLow, mmHigh);
        const __m128i mmProd1 = _mm_unpackhi_epi16(mmLow, mmHigh);

        // Zero-extend to 64 bits and accumulate.
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmProd0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmProd0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmProd1, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmProd1, mmZero));
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined); the
    // product is formed in 64 bits so it cannot overflow.
    for (; i < len; i++)
    {
        sum += (int64)vec1[i] * vec2[i];
    }

    return sum;
}
|
|
// Sum of squares of `len` unsigned 16-bit values, unaligned variant.
//
// vec1   : array of at least `len` elements; no alignment required
//          (`_mm_loadu_si128`).
// len    : number of elements.
// return : sum over i in [0, len) of vec1[i] * vec1[i].
//
// With SIS_SIMD defined, groups of 8 elements are squared as full unsigned
// 16x16 -> 32-bit products (`_mm_mullo_epi16` + `_mm_mulhi_epu16`) and
// accumulated in two 64-bit lanes.
//
// Fixes relative to the previous version:
//  * `_mm_madd_epi16` multiplies *signed* 16-bit lanes (wrong for elements
//    >= 32768, and two squares of 32768 overflow its 32-bit result);
//  * the 32-bit lane accumulators overflowed after a few hundred bright
//    elements;
//  * the scalar loops squared a ushort in `int`, which overflows for large
//    values.
int64 CSISMath::Get_CCU16_UA( const ushort * vec1, int len )
{
    int i = 0;
    int64 sum = 0;

#ifdef SIS_SIMD
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 8; i += 8)
    {
        const __m128i mmData = _mm_loadu_si128((const __m128i*)(vec1 + i));

        // Low and high 16 bits of each unsigned square, interleaved into
        // eight exact 32-bit values.
        const __m128i mmLow = _mm_mullo_epi16(mmData, mmData);
        const __m128i mmHigh = _mm_mulhi_epu16(mmData, mmData);
        const __m128i mmSq0 = _mm_unpacklo_epi16(mmLow, mmHigh);
        const __m128i mmSq1 = _mm_unpackhi_epi16(mmLow, mmHigh);

        // Zero-extend to 64 bits and accumulate.
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmSq0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmSq0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmSq1, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmSq1, mmZero));
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined); the
    // square is formed in 64 bits so it cannot overflow.
    for (; i < len; i++)
    {
        sum += (int64)vec1[i] * vec1[i];
    }

    return sum;
}
|
|
|
// Dot product of two unsigned 16-bit arrays, aligned/unaligned variant.
//
// vec1   : array of at least `len` elements; must be 16-byte aligned
//          (`_mm_load_si128`).
// vec2   : array of at least `len` elements; may be unaligned
//          (`_mm_loadu_si128`).
// len    : number of elements.
// return : sum over i in [0, len) of vec1[i] * vec2[i].
//
// With SIS_DEBUG_ALIGNED_UNALIGNED defined the call is forwarded to
// Get_CCU16_UA(). With SIS_SIMD defined, groups of 8 elements are multiplied
// as full unsigned 16x16 -> 32-bit products (`_mm_mullo_epi16` +
// `_mm_mulhi_epu16`) and accumulated in two 64-bit lanes.
//
// Fixes relative to the previous version:
//  * `_mm_madd_epi16` multiplies *signed* 16-bit lanes, so any element
//    >= 32768 produced a wrong product;
//  * the 32-bit lane accumulators overflowed after a few hundred bright
//    elements;
//  * the scalar loops multiplied two ushort values in `int`, which overflows
//    for large values.
int64 CSISMath::Get_CCU16_AUA( const ushort * vec1, const ushort * vec2, int len )
{
#if defined(SIS_DEBUG_ALIGNED_UNALIGNED)
    return Get_CCU16_UA(vec1, vec2, len);
#endif

    int i = 0;
    int64 sum = 0;

#ifdef SIS_SIMD
    const __m128i mmZero = _mm_setzero_si128();
    __m128i mmAcc = _mm_setzero_si128();

    for (; i <= len - 8; i += 8)
    {
        const __m128i mmA = _mm_load_si128((const __m128i*)(vec1 + i));
        const __m128i mmB = _mm_loadu_si128((const __m128i*)(vec2 + i));

        // Low and high 16 bits of each unsigned product, interleaved into
        // eight exact 32-bit products.
        const __m128i mmLow = _mm_mullo_epi16(mmA, mmB);
        const __m128i mmHigh = _mm_mulhi_epu16(mmA, mmB);
        const __m128i mmProd0 = _mm_unpacklo_epi16(mmLow, mmHigh);
        const __m128i mmProd1 = _mm_unpackhi_epi16(mmLow, mmHigh);

        // Zero-extend to 64 bits and accumulate.
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmProd0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmProd0, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpacklo_epi32(mmProd1, mmZero));
        mmAcc = _mm_add_epi64(mmAcc, _mm_unpackhi_epi32(mmProd1, mmZero));
    }

    int64 lanes[2];
    _mm_storeu_si128((__m128i*)lanes, mmAcc);
    sum = lanes[0] + lanes[1];
#endif

    // Leftover elements (all of them when SIS_SIMD is not defined); the
    // product is formed in 64 bits so it cannot overflow.
    for (; i < len; i++)
    {
        sum += (int64)vec1[i] * vec2[i];
    }

    return sum;
}
|