// SISMath.cpp: implementation of the CSISMath class. // ////////////////////////////////////////////////////////////////////// #include "stdafx.h" #include "SISMath.h" #include #include #include #ifdef _DEBUG #define new DEBUG_NEW #undef THIS_FILE static char THIS_FILE[] = __FILE__; #endif ////////////////////////////////////////////////////////////////////// // Construction/Destruction ////////////////////////////////////////////////////////////////////// #define SIS_SIMD #define MOMP_MAX_THREAD_SIZE 4 //#define SIS_ASM int64 CSISMath::OMP_CCU8_A(const uchar * vec1, int len) { int s = 0; int64 sum = 0; int64 sumT[MOMP_MAX_THREAD_SIZE] = {0}; #pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE) { int i= 0; int templen= len- 16; __m128i mmResult, mmResult2; __m128i mmA; __m128i mmAA, mmBB; __m128i mmZeroData= _mm_setzero_si128(); int64 res64[2]; mmResult= _mm_setzero_si128(); #pragma omp for private(i) for(i= 0; i <= templen; i+= 16) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); } _mm_storeu_si128((__m128i*)res64, mmResult); sumT[omp_get_thread_num()]= res64[0]+ res64[1]; } int i = 0; for (i = 0; i < MOMP_MAX_THREAD_SIZE; i++) { sum += sumT[i]; } i= len- (len&15); for(; i < len; i++ ) { s += vec1[i] * vec1[i]; } return sum + s; } int64 CSISMath::OMP_CCU8_UA(const uchar *vec1, int len) { int s = 0; int64 sum = 0; int64 sumT[MOMP_MAX_THREAD_SIZE]; #pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE) { int i = 0; int templen= len- 16; int64 res64[2]; __m128i mmResult, mmResult2; __m128i mmA, mmAA, mmBB; __m128i mmZeroData= _mm_setzero_si128(); mmResult= mmZeroData; #pragma omp for private(i) for(i= 0; i <= templen; i+= 16) { mmA= _mm_loadu_si128((__m128i*) (vec1+ i)); mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmAA); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); } _mm_storeu_si128((__m128i*)res64, mmResult); sumT[omp_get_thread_num()]= res64[0]+ res64[1]; } int i = 0; for(i= 0; i< MOMP_MAX_THREAD_SIZE; i++) { sum+= sumT[i]; } i= len- (len&15); for(; i < len; i++ ) { s += vec1[i] * vec1[i]; } return sum+ s; } int64 CSISMath::OMP_CCU8_AUA( const uchar * vec1, const uchar * vec2, int len ) { int s = 0; int64 sum = 0; int64 temp[MOMP_MAX_THREAD_SIZE]; #pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE) { int i = 0; int templen= len- 16; __m128i mmResult, mmResult2; __m128i mmA, mmB; __m128i mmAA, mmBB; __m128i mmZeroData= _mm_setzero_si128(); mmResult= _mm_setzero_si128(); int64 res64[2]; #pragma omp for private(i) for(i= 0; i <= templen; i+= 16) { mmA= _mm_load_si128((__m128i*) (vec1+ i)); mmB= _mm_loadu_si128((__m128i*) (vec2+ i)); mmAA= _mm_unpackhi_epi8(mmA, mmZeroData); mmBB= _mm_unpackhi_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); mmAA= _mm_unpacklo_epi8(mmA, mmZeroData); mmBB= _mm_unpacklo_epi8(mmB, mmZeroData); mmResult2= _mm_madd_epi16(mmAA, mmBB); mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData); mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData); mmResult= _mm_add_epi64(mmAA, mmResult); mmResult= _mm_add_epi64(mmBB, mmResult); } _mm_storeu_si128((__m128i*)res64, mmResult); temp[omp_get_thread_num()]= res64[0]+ res64[1]; } int i = 0; for(i= 0; i< MOMP_MAX_THREAD_SIZE; i++) { sum+= temp[i]; } i= len- (len&15); for(; i < len; i++ ) { s += vec1[i] * vec1[i]; } return sum+ s; }