// ~liuyang/MiniLED_ADM.git

// SISMath.cpp: implementation of the CSISMath class.
//
//////////////////////////////////////////////////////////////////////
 
#include "stdafx.h"
#include "SISMath.h"
 
 
#include <emmintrin.h>
#include <tmmintrin.h>
#include <omp.h>
 
// Debug-build boilerplate: when _DEBUG is defined, every use of `new` in this
// file expands to DEBUG_NEW (defined outside this file, presumably by MFC via
// stdafx.h -- TODO confirm), and THIS_FILE holds this translation unit's
// __FILE__ string.
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
 
//////////////////////////////////////////////////////////////////////
// Build configuration for the OpenMP / SSE routines
//////////////////////////////////////////////////////////////////////
 
// NOTE(review): SIS_SIMD is not referenced in the visible part of this file;
// presumably it enables SIMD code paths elsewhere -- confirm before removing.
#define SIS_SIMD
 
// Number of threads requested through num_threads() by the OMP_* routines
// below, and the size of their per-thread partial-sum arrays.
#define MOMP_MAX_THREAD_SIZE 4
 
// SIS_ASM is left undefined (disabled) here; its purpose is not visible in
// this part of the file.
//#define SIS_ASM
 
 
int64 CSISMath::OMP_CCU8_A(const uchar * vec1, int len)
{
    int s = 0;
    int64 sum = 0;
 
    int64 sumT[MOMP_MAX_THREAD_SIZE] = {0};
#pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE)
    {
        int i= 0;
        int templen= len- 16;
        __m128i    mmResult, mmResult2;
        __m128i    mmA;
        __m128i    mmAA, mmBB;    
        __m128i mmZeroData= _mm_setzero_si128();
 
        int64 res64[2];
        mmResult= _mm_setzero_si128();
#pragma omp for private(i)
        for(i= 0; i <= templen; i+= 16)
        {
            mmA= _mm_load_si128((__m128i*) (vec1+ i));
 
            mmAA= _mm_unpackhi_epi8(mmA, mmZeroData);
            mmResult2= _mm_madd_epi16(mmAA, mmAA);
            mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
            mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
            mmResult= _mm_add_epi64(mmAA, mmResult);
            mmResult= _mm_add_epi64(mmBB, mmResult);
 
            mmAA= _mm_unpacklo_epi8(mmA, mmZeroData);
            mmResult2= _mm_madd_epi16(mmAA, mmAA);
            mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
            mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
            mmResult= _mm_add_epi64(mmAA, mmResult);
            mmResult= _mm_add_epi64(mmBB, mmResult);
        }
        _mm_storeu_si128((__m128i*)res64, mmResult);
        sumT[omp_get_thread_num()]= res64[0]+ res64[1];
    }
 
    int i = 0;
    for (i = 0; i < MOMP_MAX_THREAD_SIZE; i++)
    {
        sum += sumT[i];
    }
 
    i= len- (len&15);
    for(; i < len; i++ )
    {
        s += vec1[i] * vec1[i];
    }
 
    return sum + s;
}
 
int64    CSISMath::OMP_CCU8_UA(const uchar *vec1, int len)
{
    int s = 0;
    int64 sum = 0;
    
    int64 sumT[MOMP_MAX_THREAD_SIZE];
#pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE)
    {
        int i = 0;
        int templen= len- 16;
        int64    res64[2];
        __m128i    mmResult, mmResult2;
        __m128i mmA, mmAA, mmBB;
        __m128i mmZeroData= _mm_setzero_si128();
        mmResult= mmZeroData;
#pragma omp for private(i)
        for(i= 0; i <= templen; i+= 16)
        {
            mmA= _mm_loadu_si128((__m128i*) (vec1+ i));
 
            mmAA= _mm_unpackhi_epi8(mmA, mmZeroData);
            mmResult2= _mm_madd_epi16(mmAA, mmAA);
            mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
            mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
            mmResult= _mm_add_epi64(mmAA, mmResult);
            mmResult= _mm_add_epi64(mmBB, mmResult);
 
            mmAA= _mm_unpacklo_epi8(mmA, mmZeroData);
            mmResult2= _mm_madd_epi16(mmAA, mmAA);
            mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
            mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
            mmResult= _mm_add_epi64(mmAA, mmResult);
            mmResult= _mm_add_epi64(mmBB, mmResult);
        }
        _mm_storeu_si128((__m128i*)res64, mmResult);
        sumT[omp_get_thread_num()]= res64[0]+ res64[1];
    }
 
    int i = 0;
    for(i= 0; i< MOMP_MAX_THREAD_SIZE; i++)
    {
        sum+= sumT[i];
    }
 
 
    i= len- (len&15);
    for(; i < len; i++ )
    {
        s += vec1[i] * vec1[i];
    }
 
    return sum+ s;
}
 
int64    CSISMath::OMP_CCU8_AUA( const uchar * vec1, const uchar * vec2, int len )
{
    int s = 0;
    int64 sum = 0;
    int64 temp[MOMP_MAX_THREAD_SIZE];
    
 
 
#pragma omp parallel num_threads(MOMP_MAX_THREAD_SIZE)
    {
        int i = 0;
        int    templen= len- 16;
        __m128i    mmResult, mmResult2;
        __m128i    mmA, mmB;
        __m128i    mmAA, mmBB;
        __m128i mmZeroData= _mm_setzero_si128();
        mmResult= _mm_setzero_si128();
        int64 res64[2];
 
#pragma omp for private(i)
        for(i= 0; i <= templen; i+= 16)
        {
            mmA= _mm_load_si128((__m128i*) (vec1+ i));
            mmB= _mm_loadu_si128((__m128i*) (vec2+ i));
 
            mmAA= _mm_unpackhi_epi8(mmA, mmZeroData);
            mmBB= _mm_unpackhi_epi8(mmB, mmZeroData);
            mmResult2= _mm_madd_epi16(mmAA, mmBB);
            mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
            mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
            mmResult= _mm_add_epi64(mmAA, mmResult);
            mmResult= _mm_add_epi64(mmBB, mmResult);
 
            mmAA= _mm_unpacklo_epi8(mmA, mmZeroData);
            mmBB= _mm_unpacklo_epi8(mmB, mmZeroData);
            mmResult2= _mm_madd_epi16(mmAA, mmBB);
            mmAA= _mm_unpackhi_epi32(mmResult2, mmZeroData);
            mmBB= _mm_unpacklo_epi32(mmResult2, mmZeroData);
            mmResult= _mm_add_epi64(mmAA, mmResult);
            mmResult= _mm_add_epi64(mmBB, mmResult);
        }
        _mm_storeu_si128((__m128i*)res64, mmResult);
        temp[omp_get_thread_num()]= res64[0]+ res64[1];
    }
    int i = 0;
 
    for(i= 0; i< MOMP_MAX_THREAD_SIZE; i++)
    {
        sum+= temp[i];
    }
 
    i= len- (len&15);
    for(; i < len; i++ )
    {
        s += vec1[i] * vec1[i];
    }
    return sum+ s;
}