/*@@@**************************************************************************
 * \file  dsi
 * \author Hernan Badino
 * \date  Thu Jun 25 11:27:04 EDT 2009
 * \notes 
 *******************************************************************************
 ******************************************************************************/
//#undef _OPENMP

/* INCLUDES */
#include <limits>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#if defined ( _OPENMP )
#include <omp.h>
#endif

/// Constructor. Creates an operator of the given cube type whose disparity
/// space image is empty (all dimensions zero, no cost buffer), sets the
/// "large number" cost and selects the cost scaler through setScale().
///
/// \param f_type_e  Cube type stored in m_dsiType; compute() dispatches on it.
template <class ImgDType_, class CostType_>
inline
CDispSpaceImageOp<ImgDType_, CostType_>::CDispSpaceImageOp ( ECubeType f_type_e )
        : m_dsiType  ( f_type_e ),
          m_scaler_t ( 0 )
{
    /// No cost buffer is owned yet.
    m_dsi.data_p       = NULL;

    /// Empty disparity interval.
    m_dsi.maxDisp_i    = 0;
    m_dsi.minDisp_i    = 0;
    m_dsi.dispRange_ui = 0;

    /// Empty image.
    m_dsi.height_ui    = 0;
    m_dsi.width_ui     = 0;

    /// 80% of the largest value representable in CostType_.
    m_largeNumber_t =
        static_cast<CostType_>( std::numeric_limits<CostType_>::max() * 0.8 );

    /// Pick the scaler matching this <ImgDType_, CostType_> combination.
    setScale();
}

/// Destructor. Prints a trace line with the current DSI dimensions and
/// releases the cost buffer.
template <class ImgDType_, class CostType_>
inline 
CDispSpaceImageOp<ImgDType_, CostType_>::~CDispSpaceImageOp ( )
{
    /// The dimensions are printed as unsigned values (%u); the explicit
    /// casts keep the arguments in sync with the format specifiers.
    printf("Destructing dsi with width = %u height = %u\n",
           static_cast<unsigned int>( m_dsi.width_ui ),
           static_cast<unsigned int>( m_dsi.height_ui ) );

    /// delete [] on a null pointer is a no-op, so no check is required.
    delete [] m_dsi.data_p;
    m_dsi.data_p = NULL;
}

/// Parameter loading hook. The body is empty: nothing is read from the
/// parameter reader and the argument is ignored.
template <class ImgDType_, class CostType_>
inline void
CDispSpaceImageOp<ImgDType_, CostType_>::loadParameters ( const CParamIOHandling & /*f_paramReader*/ )
{
    /// Nothing to load.
    return;
}

/// Set the image size and the disparity interval, (re)allocating the cost
/// buffer when any of the four values differs from the current one.
///
/// The new buffer holds width * height * (maxDisp - minDisp + 1) costs and is
/// zero-filled. If the allocation fails the DSI is left empty (all dimensions
/// zero, no buffer) so that the object stays consistent and a later call with
/// the same arguments retries the allocation.
///
/// \param f_width_ui   Image width.
/// \param f_height_ui  Image height.
/// \param f_minDisp_i  Smallest disparity of the DSI.
/// \param f_maxDisp_i  Largest disparity of the DSI (inclusive).
template <class ImgDType_, class CostType_>
inline void
CDispSpaceImageOp<ImgDType_, CostType_>::setImageSizes ( unsigned int f_width_ui,
                                                         unsigned int f_height_ui,
                                                         int          f_minDisp_i,
                                                         int          f_maxDisp_i )
{
    /// Nothing to do if the geometry is unchanged.
    if ( m_dsi.width_ui  == f_width_ui  &&
         m_dsi.height_ui == f_height_ui &&
         m_dsi.minDisp_i == f_minDisp_i &&
         m_dsi.maxDisp_i == f_maxDisp_i )
        return;

    /// +1 because the disparity interval [minDisp, maxDisp] is inclusive.
    const unsigned int dispRange_ui = (unsigned int)( f_maxDisp_i - 
                                                      f_minDisp_i + 1);

    /// Element count computed in size_t so that the product is not
    /// truncated on platforms where long is only 32 bits wide.
    const size_t size_ui = ( (size_t) f_width_ui  * 
                             (size_t) f_height_ui * 
                             (size_t) dispRange_ui );

    /// Release the old buffer first (avoids holding two buffers at once) and
    /// put the DSI into the empty state. Should the allocation below throw,
    /// no dangling pointer is left behind for the destructor to delete again.
    delete [] m_dsi.data_p;
    m_dsi.data_p       = NULL;
    m_dsi.width_ui     = 0;
    m_dsi.height_ui    = 0;
    m_dsi.dispRange_ui = 0;
    m_dsi.minDisp_i    = 0;
    m_dsi.maxDisp_i    = 0;

    m_dsi.data_p = new CostType_ [ size_ui ];

    /// A throwing operator new never returns NULL; this check only matters
    /// for builds whose operator new reports failure with a null pointer.
    if ( not m_dsi.data_p )
    {
        logger::error("Disparity Space Image data could not be allocated.");
        return;
    }

    /// Reset space.
    memset ( m_dsi.data_p, 0x00, size_ui * sizeof(CostType_) );

    /// Publish the new geometry only once the buffer exists.
    m_dsi.width_ui     = f_width_ui;
    m_dsi.height_ui    = f_height_ui;
    m_dsi.minDisp_i    = f_minDisp_i;
    m_dsi.maxDisp_i    = f_maxDisp_i;
    m_dsi.dispRange_ui = dispRange_ui;
}

/// Compute the disparity space image from a stereo pair, dispatching to the
/// routine that matches the cube type given at construction.
///
/// \param f_leftImg   Left image.
/// \param f_rightImg  Right image.
/// \return false if the two images do not share the same format, if the
///         pixel size differs from sizeof(ImgDType_), or if the cube type is
///         not handled; otherwise the result of the selected routine.
template <class ImgDType_, class CostType_>
inline bool
CDispSpaceImageOp<ImgDType_, CostType_>::compute ( const CTypedImage<ImgDType_> & f_leftImg, 
                                                   const CTypedImage<ImgDType_> & f_rightImg )
{
    /// Both images must match each other and the pixel type of this operator.
    const bool inputOk_b =
        f_leftImg.hasSameFormat ( f_rightImg ) &&
        f_leftImg.getBytesPerPixel() == sizeof(ImgDType_);

    if ( !inputOk_b )
    {
        logger::warn ("Left and right images do not have the same format\n");
        return false; 
    }

    if ( m_dsiType == CT_UVD )
        return computeUVD( f_leftImg, f_rightImg );

    if ( m_dsiType == CT_UDV )
        return computeUDV( f_leftImg, f_rightImg );

    if ( m_dsiType == CT_DUV )
        return computeDUV( f_leftImg, f_rightImg );

    /// Unknown cube type.
    return false;
}

/// Compute the DSI as a stack of disparity slices: slice (d - minDisp) is a
/// width x height cost image.
///
/// For every disparity d in [minDisp, maxDisp] and every left pixel (u, v)
/// whose partner column u - d lies inside the right image, the cost
///     ( L(u, v) - R(u - d, v) ) / 2^m_scaler_t
/// is written at row v, column u of the slice. Columns without a partner
/// are not touched.
///
/// The difference is formed in double precision and the scale factor is kept
/// as a double:
///  - storing 2^m_scaler_t in CostType_ overflows for narrow cost types
///    (e.g. 2^13 does not fit into a char), and
///  - subtracting unsigned 32-bit pixels directly wraps around for negative
///    differences.
/// Scaling by a power of two is exact in double, and the final conversion
/// truncates toward zero exactly like the integer division did.
/// NOTE(review): CostType_ is assumed to be a signed or floating-point type,
/// since costs can be negative -- confirm against the instantiations in use.
/// NOTE(review): both images are assumed to be densely packed with
/// m_dsi.width_ui pixels per row and m_dsi.height_ui rows -- confirm.
///
/// \param f_leftImg   Left image.
/// \param f_rightImg  Right image.
/// \return Always true.
template <class ImgDType_, class CostType_>
inline bool
CDispSpaceImageOp<ImgDType_, CostType_>::computeUVD ( const CTypedImage<ImgDType_> & f_leftImg, 
                                                      const CTypedImage<ImgDType_> & f_rightImg )
{
    ImgDType_ * const left_p  = (ImgDType_ *) f_leftImg.getData();
    ImgDType_ * const right_p = (ImgDType_ *) f_rightImg.getData();

    bool success_b = true;

    /// 1 / 2^m_scaler_t; multiplying by it equals dividing by 2^m_scaler_t.
    const double scale_d = 1. / pow(2., m_scaler_t);

    /// Number of costs per disparity slice, in size_t so that the slice
    /// offset below cannot overflow 32-bit arithmetic.
    const size_t sliceSize_ui = (size_t) m_dsi.width_ui * (size_t) m_dsi.height_ui;

#if defined ( _OPENMP )
    const unsigned int numThreads_ui = omp_get_max_threads();
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for (int d = m_dsi.minDisp_i; d <= m_dsi.maxDisp_i; ++d)
    {
        ImgDType_ * l_p = left_p;
        ImgDType_ * r_p = right_p;

        CostType_ * dst_p = 
            m_dsi.data_p +
            (size_t)(d - m_dsi.minDisp_i) * sliceSize_ui;

        /// Skip the |d| columns that have no partner in the other image:
        /// for d < 0 the right image starts |d| columns later, for d >= 0 the
        /// left image (and the destination, indexed by left column) does.
        if (d < 0)
            r_p -= d;
        else
        {
            l_p += d;
            dst_p += d;
        }

        int offset_i = abs(d);
        int width_i = m_dsi.width_ui - offset_i;

        /// After each row all pointers jump over the |d| skipped columns.
        for ( int v = 0; 
              v < (signed)m_dsi.height_ui; 
              ++v, l_p+=offset_i, r_p+=offset_i, dst_p+=offset_i)
        {
            for (int u = 0; u < width_i; ++u, ++l_p, ++r_p, ++dst_p)
            {
                *dst_p = (CostType_) ( ( (double)(*l_p) - (double)(*r_p) ) * scale_d );
#if 0
                if (v == 323/2)
                {
                    long int diff1 = (dst_p - m_dsi.data_p);
                    long int diff2 = (l_p - left_p);
                    long int diff3 = (r_p - right_p);
                    long int dd = diff1 / (m_dsi.width_ui * m_dsi.height_ui);
                    long int dv = (diff1 - dd * (m_dsi.width_ui * m_dsi.height_ui)) / m_dsi.width_ui;
                    long int du = diff1 % m_dsi.width_ui;

                    printf("UVD u: %i v: %i d: %i dest (%li %li %li) is the difference between L(%li, %li) an R(%li, %li) (val1: %f, val2: %f: diff: %f)\n",
                           u, v, d,
                           dv, du, dd,
                           (diff2/m_dsi.width_ui), diff2%m_dsi.width_ui,
                           (diff3/m_dsi.width_ui), diff3%m_dsi.width_ui, 
                           (float)(*l_p), (float)(*r_p), (float)(*dst_p) );
                    
                }
#endif
            }
        }
    }

    return success_b;
}


/// Placeholder for the UDV cube layout, which is not implemented: a warning
/// is logged and the images are ignored.
///
/// \return Always false.
template <class ImgDType_, class CostType_>
inline bool
CDispSpaceImageOp<ImgDType_, CostType_>::computeUDV ( const CTypedImage<ImgDType_> & /*f_leftImg*/, 
                                                      const CTypedImage<ImgDType_> & /*f_rightImg*/ )
{
    const bool implemented_b = false;

    logger::warn("Not implemented so far");

    return implemented_b;
}


/// Compute the DSI with one disparity scanline per pixel.
///
/// For every pixel (u, v) of the left image and every disparity d of
/// [minDisp, maxDisp] whose partner column u - d lies inside the right image,
/// the cost
///     ( L(u, v) - R(u - d, v) ) / 2^m_scaler_t
/// is written at index (d - minDisp) of the scanline returned by
/// m_dsi.getDispScanline ( v, u ). Disparities without a partner column are
/// not touched.
///
/// Fixes with respect to the previous implementation:
///  - the destination pointer was advanced in the loop header AND indexed
///    with d, so costs were written with stride two and ran past the end of
///    the scanline; the pointer is now fixed and only indexed with d;
///  - the scale is kept as a double instead of CostType_ (2^m_scaler_t
///    overflows narrow cost types such as char);
///  - the difference is formed in double, so unsigned 32-bit pixels no
///    longer wrap around for negative differences.
/// NOTE(review): getDispScanline ( v, u ) is presumed to point at the cost of
/// disparity minDisp of pixel (u, v) -- confirm against its definition.
/// NOTE(review): CostType_ is assumed to be a signed or floating-point type.
/// NOTE(review): both images are assumed to be densely packed with
/// m_dsi.width_ui pixels per row -- confirm.
///
/// \param f_leftImg   Left image.
/// \param f_rightImg  Right image.
/// \return Always true.
template <class ImgDType_, class CostType_>
inline bool
CDispSpaceImageOp<ImgDType_, CostType_>::computeDUV ( const CTypedImage<ImgDType_> & f_leftImg, 
                                                      const CTypedImage<ImgDType_> & f_rightImg )
{
    ImgDType_ * const left_p  = (ImgDType_ *) f_leftImg.getData();
    ImgDType_ * const right_p = (ImgDType_ *) f_rightImg.getData();

    bool success_b = true;
 
    /// 1 / 2^m_scaler_t; multiplying by it equals dividing by 2^m_scaler_t
    /// and is exact in double precision.
    const double scale_d = 1. / pow(2., m_scaler_t);

#if defined ( _OPENMP )
    const unsigned int numThreads_ui = omp_get_max_threads();
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for ( int v = 0; v < (signed)m_dsi.height_ui; ++v )
    {
        int u = 0;
        
        ImgDType_ * l_p = left_p  + (size_t) v * m_dsi.width_ui + u;
        ImgDType_ * r_p = right_p + (size_t) v * m_dsi.width_ui + u;

        for ( ; u < (signed)m_dsi.width_ui; ++u, ++l_p, ++r_p )
        {
            /// Clamp the disparity interval so that the partner column u - d
            /// stays within [0, width - 1].
            int firstRCol_i = std::max(u-m_dsi.maxDisp_i, 0);
            int maxd_i = u - firstRCol_i;
            int mind_i = std::max (m_dsi.minDisp_i, (int)(u + 1) - (int)m_dsi.width_ui);

            /// Biased by -minDisp so that dst_p[d] addresses disparity d.
            /// The pointer must NOT be advanced inside the loop below.
            CostType_ * const dst_p = 
                m_dsi.getDispScanline( v, u ) - m_dsi.minDisp_i;

            for ( int d = mind_i; d <= maxd_i; ++d )
            {
                dst_p[d] = (CostType_) ( ( (double)(*l_p) - (double)(r_p[-d]) ) * scale_d );
#if 0
                if (v == 323/2)
                {
                    long int diff1 = (dst_p - m_dsi.data_p) + d;
                    long int diff2 = (l_p - left_p);
                    long int diff3 = (r_p - right_p) - d;
                    long int dv = diff1 / (m_dsi.width_ui * m_dsi.dispRange_ui);
                    long int du = (diff1 - dv * (m_dsi.width_ui * m_dsi.dispRange_ui)) / m_dsi.dispRange_ui;
                    long int dd = diff1 % m_dsi.dispRange_ui;

                    printf("DUV u: %i v: %i d: %i dest (%li %li %li) is the difference between L(%li, %li) an R(%li, %li) (val1: %f, val2: %f: diff: %f)\n",
                           u, v, d,
                           dv, du, dd,
                           (diff2/m_dsi.width_ui), diff2%m_dsi.width_ui,
                           (diff3/m_dsi.width_ui), diff3%m_dsi.width_ui, 
                           (float)(*l_p), (float)(r_p[-d]), (float)(dst_p[d]) );
                }
#endif
            }
            
        }
    }

    return success_b;
}

/// Select the cost scaler (generic case): the scaler is 0, i.e. the cost
/// routines that divide by 2^m_scaler_t leave pixel differences unscaled.
/// A trace line is printed. Explicit specialisations of this function
/// provide other scalers for particular <ImgDType_, CostType_> pairs.
template <class ImgDType_, class CostType_>
inline void CDispSpaceImageOp<ImgDType_, CostType_>::setScale()
{
    m_scaler_t = 0;

    printf("Setting default scaler to 0\n");
}

/// Cost scaler for unsigned short int images with short int costs:
/// the scaler is 2 (pixel differences are divided by 2^2).
template <>
inline void CDispSpaceImageOp<unsigned short int, short int>::setScale()
{
    m_scaler_t = 2;
}

/// Cost scaler for unsigned short int images with char costs:
/// the scaler is 13 (pixel differences are divided by 2^13).
template <>
inline void CDispSpaceImageOp<unsigned short int, char>::setScale()
{
    m_scaler_t = 13;
}

/// Cost scaler for unsigned short int images with signed char costs:
/// the scaler is 9 (pixel differences are divided by 2^9).
template <>
inline void CDispSpaceImageOp<unsigned short int, signed char>::setScale()
{
    m_scaler_t = 9;
}

/// Cost scaler for unsigned int images with int costs:
/// the scaler is 2 (pixel differences are divided by 2^2).
template <>
inline void CDispSpaceImageOp<unsigned int, int>::setScale()
{
    m_scaler_t = 2;
}

/// Cost scaler for unsigned int images with short int costs:
/// the scaler is 17 (pixel differences are divided by 2^17).
template <>
inline void CDispSpaceImageOp<unsigned int, short int>::setScale()
{
    m_scaler_t = 17;
}

/// Cost scaler for unsigned int images with char costs:
/// the scaler is 25 (pixel differences are divided by 2^25).
template <>
inline void CDispSpaceImageOp<unsigned int, char>::setScale()
{
    m_scaler_t = 25;
}

/// Compute a windowed zero-mean cost for every disparity using running sums.
///
/// For each disparity d in [minDisp, maxDisp] the per-pixel value
///     val = ( L(u, v) - R(u - d, v) ) / norm_f
/// is accumulated per column over kH rows (sum of val and sum of val^2).
/// A window of kW columns is then slid along each row and
///     ( sum(val^2) / n - (sum(val) / n)^2 ) * norm_f,   n = kW * kH
/// is written to m_dsi.getPtrAtRowColumnDisp ( v, u, d ) for the window centre
/// (u, v). After each row the column sums are updated incrementally by
/// removing the top row of the window and adding the row below it.
/// Border pixels for which the window does not fit are not written.
///
/// Guards added with respect to the previous implementation:
///  - images wider than the fixed-size column-sum buffers are rejected
///    instead of overflowing the stack arrays;
///  - a kernel that is non-positive or taller than the image is rejected;
///  - disparities whose overlap of the two images is narrower than the
///    kernel are skipped instead of reading uninitialised sums and writing
///    outside the image.
///
/// NOTE(review): the debugging macros PL, PR, PC and DST are never undefined
/// and leak out of this function -- kept as is to stay compatible.
/// NOTE(review): avg_t has type CostType_; for integral cost types the window
/// mean is truncated -- confirm this is intended.
/// NOTE(review): (*l_p - *r_p) is evaluated in the promoted pixel type and
/// wraps around for unsigned 32-bit images -- confirm the supported types.
/// NOTE(review): both images are assumed to be densely packed with
/// m_dsi.width_ui pixels per row and m_dsi.height_ui rows -- confirm.
///
/// \param f_leftImg      Left image.
/// \param f_rightImg     Right image.
/// \param f_kW_i         Kernel width; even values are enlarged by one.
/// \param f_kH_i         Kernel height; even values are enlarged by one.
/// \param f_normalize_b  If true every cost is additionally divided by n.
/// \return true on success, false if one of the guards above rejects the
///         input.
template <class ImgDType_, class CostType_>
inline bool
CDispSpaceImageOp<ImgDType_, CostType_>::computeZSSD ( const CTypedImage<ImgDType_> & f_leftImg, 
                                                       const CTypedImage<ImgDType_> & f_rightImg,
                                                       const int      f_kW_i,
                                                       const int      f_kH_i,
                                                       const bool     f_normalize_b )
{
    /// For debugging purposes.
#define PL(p) (p-left_p)/m_dsi.width_ui, (p-left_p)%m_dsi.width_ui
#define PR(p) (p-right_p)/m_dsi.width_ui, (p-right_p)%m_dsi.width_ui
#define PC    (cs_p-colSum_p), (cs2_p-colSum2_p)
#define DST   (dst_p - m_dsi.data_p) / (m_dsi.width_ui * m_dsi.height_ui), \
              ((dst_p - m_dsi.data_p) - ((dst_p - m_dsi.data_p) / (m_dsi.width_ui * m_dsi.height_ui)) * (m_dsi.width_ui * m_dsi.height_ui)) / m_dsi.width_ui, \
              (dst_p - m_dsi.data_p) % m_dsi.width_ui, *dst_p

    ImgDType_ * const left_p  = (ImgDType_ *) f_leftImg.getData();
    ImgDType_ * const right_p = (ImgDType_ *) f_rightImg.getData();


    /// Force odd kernel sizes so that the window has a centre pixel.
    const int kW_i = f_kW_i + (1-f_kW_i%2);
    const int kH_i = f_kH_i + (1-f_kH_i%2);
    
    const int n_i = kW_i * kH_i;

    const float norm_f = 100000.; 
    
    const bool sqrt_b  = false;

    const bool truncate_b  = false;
    const float maxValue_f  = 9;
    
    /// Capacity of the per-thread column-sum buffers below.
    const int maxThreads_i = 16;
    const int maxWidth_i   = 2048;

    if ( kW_i < 1 || kH_i < 1 )
    {
        logger::warn("ZSSD: kernel size must be positive");
        return false;
    }

    if ( m_dsi.width_ui > (unsigned int) maxWidth_i )
    {
        logger::warn("ZSSD: image width exceeds the size of the column sum buffers");
        return false;
    }

    if ( (unsigned int) kH_i > m_dsi.height_ui )
    {
        logger::warn("ZSSD: kernel height exceeds the image height");
        return false;
    }

    /// One row of column sums (of val and of val^2) per thread.
    float colSumArray_p [maxThreads_i][maxWidth_i];
    float colSum2Array_p[maxThreads_i][maxWidth_i];

#if defined ( _OPENMP )
    const unsigned int numThreads_ui = std::min(omp_get_max_threads(), maxThreads_i);
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for (int d = m_dsi.minDisp_i; d <= m_dsi.maxDisp_i; ++d)
    {
#ifdef _OPENMP
        int f_threadIdx_i = omp_get_thread_num();
        float *colSum_p  = colSumArray_p[f_threadIdx_i];
        float *colSum2_p = colSum2Array_p[f_threadIdx_i];
#else
        float *colSum_p  = colSumArray_p[0];
        float *colSum2_p = colSum2Array_p[0];
#endif  

        ImgDType_ * l_p;
        ImgDType_ * r_p;
        ImgDType_ * ll_p;
        ImgDType_ * rr_p;

        float     * cs_p, * cs2_p;
        float     val_f, oldVal_f, newVal_f;

        int offsetR_i, offsetL_i;

        /// First column of each image that has a partner in the other one.
        if (d < 0)
        {
            offsetL_i = 0;
            offsetR_i = -d;
        }
        else
        {
            offsetL_i = d;
            offsetR_i = 0;
        }

        int offset_i = abs(d);
        int width_i  = m_dsi.width_ui - offset_i;
        //int height_i = m_dsi.height_ui;

        /// The overlap of both images is narrower than the kernel: there is
        /// no valid window for this disparity.
        if ( width_i < kW_i )
            continue;

        l_p = left_p  + offsetL_i;
        r_p = right_p + offsetR_i;
        
        cs_p  = colSum_p  + offsetL_i;
        cs2_p = colSum2_p + offsetL_i;

        /// Initialize vector containing sums.
        for (int u = 0; u < width_i; ++u, 
                 ++l_p, ++r_p, 
                 ++cs_p, ++cs2_p)
        {
            val_f  = (*l_p - *r_p)/norm_f;
            *cs_p  = val_f;
            *cs2_p = val_f*val_f;
        }

        /// Go to next row.
        l_p   += offset_i;
        r_p   += offset_i;

        /// Calculate initial sums.
        for (int v = 1; v < kH_i; ++v)
        {
            cs_p  = colSum_p  + offsetL_i;
            cs2_p = colSum2_p + offsetL_i;
            
            /// Accumulate for every column.
            for (int u = 0; u < width_i; ++u, 
                     ++l_p, ++r_p,
                     ++cs_p, ++cs2_p)
            {
                val_f  = (*l_p - *r_p)/norm_f;
                *cs_p  += val_f;
                *cs2_p += val_f*val_f;
            }

            /// Go to next row.
            l_p   += offset_i;
            r_p   += offset_i;
        }

        const int lastRow_i = m_dsi.height_ui - kH_i/2-1;
        /// Iterate now for all valid rows.
        for ( int v = kH_i/2; v <= lastRow_i; ++v)
        {
            //CostType_ *dst_p = m_dsi.getPtrAtDispRowColumn ( d, v, kW_i/2 + offsetL_i); //!111
            CostType_ *dst_p = m_dsi.getPtrAtRowColumnDisp ( v, kW_i/2 + offsetL_i, d);//!111
            
            /// Compute first result as the sum of the sum column vectors.
            cs_p  = colSum_p  + offsetL_i;
            cs2_p = colSum2_p + offsetL_i;
            
            float sum_f  = *cs_p;
            float sum2_f = *cs2_p;

            ++cs_p;
            ++cs2_p;
            
            for ( int u = 1; u < kW_i; ++u, 
                      ++cs_p, ++cs2_p )
            {
                sum_f  += *cs_p;
                sum2_f += *cs2_p;
            }

            CostType_ avg_t = sum_f / n_i;
            *dst_p = (sum2_f / n_i - avg_t * avg_t) * norm_f;

            if ( f_normalize_b )
                *dst_p /= n_i;

            if ( sqrt_b )
                *dst_p = sqrt(*dst_p);

            if ( truncate_b )
                if ( *dst_p > maxValue_f )
                    *dst_p = 0;//maxValue_f;
            
            //++dst_p; //!111

            /// Initialize pointers.
            //// Subtraction pointers: top row of the current window.
            l_p = left_p  + (v-kH_i/2) * m_dsi.width_ui + offsetL_i;
            r_p = right_p + (v-kH_i/2) * m_dsi.width_ui + offsetR_i;
            
            //// Addition pointers: row right below the current window.
            ll_p = l_p + kH_i * m_dsi.width_ui;
            rr_p = r_p + kH_i * m_dsi.width_ui;
            
            /// 
            cs_p  = colSum_p  + offsetL_i;
            cs2_p = colSum2_p + offsetL_i;

            if (0)
                for ( int u = 0; u < kH_i; ++u)
                {
                    dst_p  = m_dsi.getPtrAtRowColumnDisp ( v, kW_i/2 + offsetL_i + u - kW_i + 1, d);//!111
                    *dst_p = m_largeNumber_t;
                }

            /// Slide the window: add the column entering on the right and
            /// remove the column leaving on the left. A column sum is only
            /// updated for the next row after its last use in this row.
            for ( int u = kW_i; u < width_i; ++u,
                      ++cs_p,   ++cs2_p, 
                      ++l_p,    ++r_p, 
                      ++ll_p,   ++rr_p ) //, ++dst_p )//!111
            {
                dst_p = m_dsi.getPtrAtRowColumnDisp ( v, kW_i/2 + offsetL_i + u - kW_i + 1, d);//!111
                sum_f  += *(cs_p +kW_i) - *cs_p;
                sum2_f += *(cs2_p+kW_i) - *cs2_p;
                
                /// Write result.
                avg_t  = sum_f  / n_i;
                *dst_p = (sum2_f / n_i - avg_t * avg_t) * norm_f;

                if ( f_normalize_b )
                    *dst_p /= n_i;

                if ( sqrt_b )
                    *dst_p = sqrt(*dst_p);
                
                if ( truncate_b )
                    if ( *dst_p > maxValue_f )
                        *dst_p = 0;//maxValue_f;

                /// In the last row there is no row below the window to add.
                if (v != lastRow_i)
                {
                    /// This column sum can now be updated for the next row.
                    oldVal_f = (*l_p  - *r_p)/norm_f;
                    newVal_f = (*ll_p - *rr_p)/norm_f;
                    
                    *cs_p  += newVal_f - oldVal_f;
                    *cs2_p += newVal_f*newVal_f - oldVal_f*oldVal_f;
                }
            }
            
            if (0)
                for ( int u = width_i - kH_i; u < width_i; ++u)
                {
                    dst_p  = m_dsi.getPtrAtRowColumnDisp ( v, kW_i/2 + offsetL_i + u - kW_i + 1, d);//!111
                    *dst_p = m_largeNumber_t;
                }

            if (v != lastRow_i)
            {
                /// Update the rest of the column sums.
                for ( int u = 0; u < kW_i; ++u,
                          ++cs_p,   ++cs2_p, 
                          ++l_p,    ++r_p, 
                          ++ll_p,   ++rr_p )
                {
                    oldVal_f = (*l_p  - *r_p) /norm_f;
                    newVal_f = (*ll_p - *rr_p)/norm_f;
                    
                    *cs_p  += newVal_f - oldVal_f;
                    *cs2_p += newVal_f*newVal_f - oldVal_f*oldVal_f;
                    
                }
            }
        }
    }
    return true;
}
