
/*@@@**************************************************************************
 ** \file  prewittFilter_inline
 * \date   Thu Nov 12 13:56:53 EST 2009
 * \author Hernan Badino
 * \notes  
*******************************************************************************
*****          (C) COPYRIGHT Hernan Badino - All Rights Reserved          *****
******************************************************************************/

#include "ippDefs.h"

#if defined ( _OPENMP )
 #include <omp.h>
#endif

/// Computes the vertical Prewitt response (horizontal gradient) of an image
/// for an arbitrary mask size, without an auxiliary buffer.
///
/// Masks of size 3x3, 5x5, 7x7 and 9x9 are forwarded to their dedicated
/// implementations. For every other size, each output pixel is the sum of the
/// mask columns right of the center minus the sum of the mask columns left of
/// the center, every column being weighted by 1/distance to the center
/// column. The result is divided by height * (width-1) * f_norm_d.
///
/// Only pixels for which the whole mask fits inside the source image are
/// written; the border of fr_dstImg is left untouched.
///
/// \param f_srcImg    Input image.
/// \param fr_dstImg   Output image; must be at least as large as f_srcImg.
/// \param f_maskSize  Size of the filter mask.
/// \param f_norm_d    Additional normalization factor.
/// \return false if the mask is degenerate, if the mask does not fit into
///         the source image, or if the destination image is too small;
///         true otherwise (for the general path).
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeVertical ( const CTypedImage<_SrcType> &f_srcImg,
                                                      CTypedImage<_DstType>       &fr_dstImg,
                                                      const S2D<unsigned int>     f_maskSize,
                                                      const double                f_norm_d )
{
    if ( f_maskSize.width  == 3 &&
         f_maskSize.height == 3 )
        return computeVertical3x3 ( f_srcImg, fr_dstImg, f_norm_d );
    else if ( f_maskSize.width  == 5 &&
              f_maskSize.height == 5 )
        return computeVertical5x5 ( f_srcImg, fr_dstImg, f_norm_d );
    else if ( f_maskSize.width  == 7 &&
              f_maskSize.height == 7 )
        return computeVertical7x7 ( f_srcImg, fr_dstImg, f_norm_d );
    else if ( f_maskSize.width  == 9 &&
              f_maskSize.height == 9 )
        return computeVertical9x9 ( f_srcImg, fr_dstImg, f_norm_d );

    /// A mask narrower than 2 columns has no side taps: the half width is 0
    /// and both the tap weights and the normalization would divide by zero.
    if ( f_maskSize.width  < 2 ||
         f_maskSize.height < 1 )
        return false;

    /// The mask must fit into the source image in BOTH dimensions. The width
    /// check is required because the running column sums are updated for
    /// (width of image) columns per row regardless of the mask size.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getWidth()  < f_maskSize.width ||
         f_srcImg.getHeight() < f_maskSize.height )
        return false;
    
#if 1
    const S2D<int> kS ( f_maskSize.width,f_maskSize.height);
    const S2D<unsigned int> hKS( f_maskSize.width/2, 
                                 f_maskSize.height/2 );

    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    const double norm_f = kS.height * (kS.width-1) * f_norm_d;
        
#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the height of the image:
    /// every band must contain at least one full mask height.
    if ( h_i / numThreads_i < kS.height )
        numP_i = std::min(std::max(1, h_i / kS.height ), numThreads_i);    

    int hp_i = h_i / numP_i;
    /// One vector of running column sums per band.
    _DstType * sumsVector_p = new _DstType[w_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int rowBot_i, rowTop_i;

        /// Bands are extended by half a mask height on each side so that the
        /// output rows of neighbouring bands are adjacent; the last band
        /// takes all remaining rows.
        rowTop_i = std::max(0,  p * hp_i - (int) hKS.height);
        if ( p == (numP_i-1) )
            rowBot_i = h_i-1;
        else
            rowBot_i = ((p+1) * hp_i - 1) + hKS.height;

        _DstType * sums_p = sumsVector_p + p * w_i;

        //printf("Computing Vertical prewitt (hor gradient) from rowBot_i = %i to rowTop_i = %i with kS.height = %i and h_i = %i numThreads_i = %i and maxP = %i\n",
        //       rowBot_i, rowTop_i, kS.height, h_i, numThreads_i, numP_i );
#else
    _DstType * sums_p = new _DstType[w_i];
    int rowTop_i = 0;
    int rowBot_i = h_i-1;
    {
#endif
        memset(sums_p, 0, sizeof(_DstType) * w_i);

        /// Compute first vector sum: for every column, the sum of the first
        /// kS.height rows of the band.
        for (int j = 0 ; j < w_i; ++j)
        {
            for (int i = rowTop_i; i < rowTop_i + kS.height; ++i)
            {
                sums_p[j] += f_srcImg.getScanline(i)[j];
            }
        }

        for (int i = rowTop_i; i <= rowBot_i-kS.height; ++i)
        {
            /// range1_p is the row entering the vertical window, range2_p the
            /// row leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(i+kS.height);
            _SrcType * range2_p = f_srcImg.getScanline(i);
            _DstType * ptrsum_p = sums_p;

            _DstType  sumL_f;
            _DstType  sumR_f;

            /// Make first sums.
            _DstType * ptrL_p = sums_p;
            _DstType * ptrR_p = sums_p + kS.width - 1;
        
            _DstType *dst_p = fr_dstImg.getScanline(i+hKS.height) + hKS.width;

            for (int j = hKS.width; j < (int)(w_i - hKS.width); 
                 ++j, ++dst_p, ++ptrL_p, ++ptrR_p, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                /// The outermost columns are weighted by 1/hKS.width, the
                /// columns next to the center by 1.
                float div_d = hKS.width;
                sumL_f = *ptrL_p/div_d;
                sumR_f = *ptrR_p/div_d;
            
                for (int k = 1; k < (int)hKS.width; ++k)
                {   
                    --div_d;
                    sumL_f += ptrL_p[ k]/div_d;
                    sumR_f += ptrR_p[-k]/div_d;
                }

                *dst_p = (_DstType) ((sumR_f - sumL_f)/norm_f);
                //printf ("PrewU[%i][%i] = %f\n", i+hKS.height, j,  (sumR_f - sumL_f)/norm_f );
                        
                /// Slide the column sum one row down. ptrsum_p points to the
                /// same column as ptrL_p, which is not read again for this row.
                *ptrsum_p += *range1_p - *range2_p;
            }

            /// Update the remaining column sums that the loop above did not reach.
            for (int j = 0; j < (int)kS.width-1; ++j, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last row without updating sums.
        _DstType sumL_f;
        _DstType sumR_f;
     
        /// Make first sums.
        _DstType * ptrL_p = sums_p;
        _DstType * ptrR_p = sums_p + kS.width - 1;

        _DstType *dst_p = fr_dstImg.getScanline(rowBot_i-hKS.height) + hKS.width;

        for (int j = hKS.width; j < (int)(w_i - hKS.width); ++j, ++dst_p, ++ptrL_p, ++ptrR_p)
        {
            /// Compute sums.
            float div_d = hKS.width;
            sumL_f = *ptrL_p/div_d;
            sumR_f = *ptrR_p/div_d;
        
            for (int k = 1; k < (int)hKS.width; ++k)
            {   
                --div_d;
                sumL_f += ptrL_p[ k]/div_d;
                sumR_f += ptrR_p[-k]/div_d;
            }
        
            *dst_p = (_DstType) ((sumR_f - sumL_f)/norm_f);
            //printf ("PrewU[%i][%i] = %f\n", h_i-hKS.height-1, j,  (sumR_f - sumL_f)/norm_f );
        }
    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

#else
    /// Disabled reference implementations, kept for debugging the optimized
    /// code above.
    if (1)
    {
        
        /// DEFAULT STANDARD NON-OPTIMIZED IMPLEMENTATION.
        /// FOR DEBUGING PURPOSES.

        S2D<int> kS ( f_maskSize.width,f_maskSize.height);
        S2D<unsigned int> hKS( f_maskSize.width/2, 
                               f_maskSize.height/2 );
    
        const int h_i = f_srcImg.getSize().height;
        const int w_i = f_srcImg.getSize().width;
    
        const double norm_f = kS.height * (kS.width-1) * f_norm_d;

        /// Compute first vector sum.
        for (int i = hKS.height; i < h_i - (int)hKS.height; ++i)
        {
            for (int j = hKS.width ; j < w_i - (int)hKS.width; ++j)
            {
                _DstType sum_f(0);
                for (int k = i-(int)hKS.height; k <= i+(int)hKS.height; ++k)
                {
                    for (int l = j-(int)hKS.width ; l < j; ++l)
                        sum_f -= f_srcImg.getScanline(k)[l]/(j-l);
                
                    for (int l = j+1 ; l <= j+(int)hKS.width; ++l)
                        sum_f += f_srcImg.getScanline(k)[l]/(l-j);                
                }
                fr_dstImg.getScanline(i)[j] = sum_f/norm_f;
                //printf ("PrewU[%i][%i] = %f\n", i, j,  sum_f/norm_f );
            }
        }    
    }
    else
    {
        
        /// DEFAULT STANDARD NON-OPTIMIZED IMPLEMENTATION.
        /// FOR DEBUGING PURPOSES.

        S2D<int> kS ( f_maskSize.width,f_maskSize.height);
        S2D<unsigned int> hKS( f_maskSize.width/2, 
                               f_maskSize.height/2 );
    
        const int h_i = f_srcImg.getSize().height;
        const int w_i = f_srcImg.getSize().width;
    

        /// Compute first vector sum.
        for (int i = hKS.height; i < h_i - (int)hKS.height; ++i)
        {
            for (int j = hKS.width ; j < w_i - (int)hKS.width; ++j)
            {
                double w;
                double sum_f = 0;
                double norm_f = 0;
                for (int k = i-(int)hKS.height; k <= i+(int)hKS.height; ++k)
                {
                    for (int l = j-(int)hKS.width ; l < j; ++l)
                    {
                        w = exp(-(l-j)*(l-j) - (k-i)*(k-i)/(kS.width*kS.height));
                        norm_f += w;
                        sum_f  -= w * f_srcImg.getScanline(k)[l]/(j-l);
                    }
                    
                    for (int l = j+1 ; l <= j+(int)hKS.width; ++l)
                    {
                        w = exp(-(l-j)*(l-j) - (k-i)*(k-i)/(kS.width*kS.height));
                        norm_f += w;
                        sum_f  += w * f_srcImg.getScanline(k)[l]/(l-j);
                    }
                }

                fr_dstImg.getScanline(i)[j] = (_DstType) (sum_f/(f_norm_d*norm_f));
                //printf ("PrewU[%i][%i] = %f\n", i, j,  sum_f/norm_f );
            }
        }
    }
 #endif

    return true;
}

/// Computes the horizontal Prewitt response (vertical gradient) of an image
/// for an arbitrary mask size, without an auxiliary buffer.
///
/// Masks of size 3x3, 5x5, 7x7 and 9x9 are forwarded to their dedicated
/// implementations. For every other size, each output pixel is the sum of the
/// mask rows above the center minus the sum of the mask rows below the
/// center, every row being weighted by 1/distance to the center row. The
/// result is divided by width * (height-1) * f_norm_d.
///
/// Only pixels for which the whole mask fits inside the source image are
/// written; the border of fr_dstImg is left untouched.
///
/// NOTE(review): rows are traversed with pointer arithmetic, which assumes
/// that the scanlines of an image are stored contiguously (row stride equal
/// to the image width) -- confirm against CTypedImage.
///
/// \param f_srcImg    Input image.
/// \param fr_dstImg   Output image; must be at least as large as f_srcImg.
/// \param f_maskSize  Size of the filter mask.
/// \param f_norm_d    Additional normalization factor.
/// \return false if the mask is degenerate, if the mask does not fit into
///         the source image, or if the destination image is too small;
///         true otherwise (for the general path).
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeHorizontal ( const CTypedImage<_SrcType> &f_srcImg,
                                                        CTypedImage<_DstType>       &fr_dstImg,
                                                        const S2D<unsigned int>     f_maskSize,
                                                        const double                f_norm_d )
{
     if ( f_maskSize.width  == 3 &&
          f_maskSize.height == 3 )
         return computeHorizontal3x3 ( f_srcImg, fr_dstImg, f_norm_d );
     else if ( f_maskSize.width  == 5 &&
               f_maskSize.height == 5 )
         return computeHorizontal5x5 ( f_srcImg, fr_dstImg, f_norm_d );
     else if ( f_maskSize.width  == 7 &&
               f_maskSize.height == 7 )
         return computeHorizontal7x7 ( f_srcImg, fr_dstImg, f_norm_d );
     else if ( f_maskSize.width  == 9 &&
               f_maskSize.height == 9 )
         return computeHorizontal9x9 ( f_srcImg, fr_dstImg, f_norm_d );

    /// A mask lower than 2 rows has no taps above/below the center: the half
    /// height is 0 and both the tap weights and the normalization would
    /// divide by zero.
    if ( f_maskSize.height < 2 ||
         f_maskSize.width  < 1 )
        return false;

    /// The mask must fit into the source image in BOTH dimensions. The height
    /// check is required because the running row sums are updated for
    /// (height of image) rows per column regardless of the mask size.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getWidth()  < f_maskSize.width ||
         f_srcImg.getHeight() < f_maskSize.height )
        return false;

#if 1
    const S2D<int> kS ( f_maskSize.width,f_maskSize.height);
    const S2D<unsigned int> hKS( f_maskSize.width/2, 
                                 f_maskSize.height/2 );

    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    /// The destination image may be wider than the source image, so its rows
    /// must be stepped with its own width and not with the source width.
    const int dstStride_i = (int) fr_dstImg.getWidth();

    const double norm_f = kS.width * (kS.height-1) * f_norm_d;

#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the width of the image:
    /// every band must contain at least one full mask width.
    if ( w_i / numThreads_i < kS.width )
        numP_i = std::min(std::max(1, w_i / kS.width ), numThreads_i);    

    int wp_i = w_i / numP_i;
    /// One vector of running row sums per band.
    _DstType * sumsVector_p = new _DstType[h_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int colLef_i, colRig_i;

        /// Bands are extended by half a mask width on each side so that the
        /// output columns of neighbouring bands are adjacent; the last band
        /// takes all remaining columns.
        colLef_i = std::max(0,  p * wp_i - (int) hKS.width);
        if ( p == (numP_i-1) )
            colRig_i = w_i-1;
        else
            colRig_i = ((p+1) * wp_i - 1) + hKS.width;

        _DstType * sums_p = sumsVector_p + p * h_i;

        //printf("Computing Horizontal prewitt (ver gradient) from colLef_i = %i to colRig_i = %i with kS.height = %i and w_i = %i numThreads_i = %i and maxP = %i\n",
        //       colLef_i, colRig_i, kS.width, w_i, numThreads_i, numP_i );
#else
    _DstType * sums_p = new _DstType[h_i];
    int colLef_i = 0;
    int colRig_i = w_i-1;
    {
#endif
        memset(sums_p, 0, sizeof(_DstType) * h_i);
    
        /// Compute first vector sum: for every row, the sum of the first
        /// kS.width columns of the band.
        for (int i = 0; i < h_i; ++i)
        {
            for (int j = colLef_i ; j < colLef_i + kS.width; ++j)
            {
                sums_p[i] += f_srcImg.getScanline(i)[j];
            }
        }
                
        for (int j = colLef_i; j <= colRig_i-kS.width; ++j)
        {
            /// range1_p is the column entering the horizontal window,
            /// range2_p the column leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(0) + j+kS.width;
            _SrcType * range2_p = f_srcImg.getScanline(0) + j;
            _DstType * ptrsum_p = sums_p;

            _DstType sumT_f;
            _DstType sumB_f;

            /// Make first sums.
            _DstType * ptrT_p = sums_p;
            _DstType * ptrB_p = sums_p + kS.height - 1;

            _DstType *dst_p = fr_dstImg.getScanline(hKS.height) + j+hKS.width;
     
            for (int i = hKS.height; i < (int)(h_i - hKS.height); ++i, 
                     dst_p+=dstStride_i, ++ptrT_p, ++ptrB_p, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                /// The outermost rows are weighted by 1/hKS.height, the rows
                /// next to the center by 1.
                float div_d = hKS.height;
                sumT_f = *ptrT_p/div_d;
                sumB_f = *ptrB_p/div_d;
            
                for (int k = 1; k < (int)hKS.height; ++k)
                {   
                    --div_d;
                    sumT_f += ptrT_p[ k]/div_d;
                    sumB_f += ptrB_p[-k]/div_d;
                }

                *dst_p =  (_DstType) ((sumT_f - sumB_f)/norm_f);
                //printf ("PrewV[%03i][%03i] = %f\n", i, j+hKS.width,  (sumT_f - sumB_f)/norm_f );

                /// Slide the row sum one column to the right. ptrsum_p points
                /// to the same row as ptrT_p, which is not read again for
                /// this column.
                *ptrsum_p += *range1_p - *range2_p;
            }

            /// Update the remaining row sums that the loop above did not reach.
            for (int i = 0 ; i < kS.height-1; ++i, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last column without updating sums.
        _DstType sumT_f;
        _DstType sumB_f;
                
        /// Make first sums.
        _DstType * ptrT_p = sums_p;
        _DstType * ptrB_p = sums_p + kS.height - 1;
                
        _DstType *dst_p = fr_dstImg.getScanline(hKS.height) + colRig_i - hKS.width;

        for (int i = hKS.height; i < (int)(h_i - hKS.height); ++i, dst_p+=dstStride_i, ++ptrT_p, ++ptrB_p)
        {
            /// Compute sums.
            float div_d = hKS.height;
            sumT_f = *ptrT_p/div_d;
            sumB_f = *ptrB_p/div_d;
        
            for (int k = 1; k < (int)hKS.height; ++k)
            {   
                --div_d;
                sumT_f += ptrT_p[ k]/div_d;
                sumB_f += ptrB_p[-k]/div_d;
            }

            *dst_p =  (_DstType) ((sumT_f - sumB_f)/norm_f);
            //printf ("PrewV[%03i][%03i] = %f\n", i, w_i-hKS.width-1,  (sumT_f - sumB_f)/norm_f );
        }

    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

#else
    /// Disabled reference implementations, kept for debugging the optimized
    /// code above.
    if (1)
    {
        /// DEFAULT STANDARD NON-OPTIMIZED IMPLEMENTATION.
        /// FOR DEBUGING PURPOSES.
        S2D<int> kS ( f_maskSize.width,f_maskSize.height);
        S2D<unsigned int> hKS( f_maskSize.width/2, 
                               f_maskSize.height/2 );
    
        const int h_i = f_srcImg.getSize().height;
        const int w_i = f_srcImg.getSize().width;
    
        const double norm_f = kS.width * (kS.height-1) * f_norm_d;
    
        /// Compute first vector sum.
        for (int i = hKS.height; i < h_i - (int)hKS.height; ++i)
        {
            for (int j = hKS.width ; j < w_i- (int)hKS.width; ++j)
            {
                _DstType sum_f(0);
                for (int l = j-hKS.width; l <= j+(int)hKS.width; ++l)
                {
                    for (int k = i-(int)hKS.height ; k < i; ++k)
                        sum_f += f_srcImg.getScanline(k)[l]/((double)i-k);
                
                    for (int k = i+1 ; k <= i+(int)hKS.height; ++k)
                        sum_f -= f_srcImg.getScanline(k)[l]/((double)k-i);                
                }

                fr_dstImg.getScanline(i)[j] = (_DstType) (sum_f/norm_f);
                //printf ("PrewV[%03i][%03i] = %f\n", i, j,  sum_f/norm_f );
            }
        }
    }
    else
    {
        /// DEFAULT STANDARD NON-OPTIMIZED IMPLEMENTATION.
        /// FOR DEBUGING PURPOSES.
        S2D<int> kS ( f_maskSize.width,f_maskSize.height);
        S2D<unsigned int> hKS( f_maskSize.width/2, 
                               f_maskSize.height/2 );
    
        const int h_i = f_srcImg.getSize().height;
        const int w_i = f_srcImg.getSize().width;
    
    
        /// Compute first vector sum.
        for (int i = hKS.height; i < h_i - (int)hKS.height; ++i)
        {
            for (int j = hKS.width ; j < w_i- (int)hKS.width; ++j)
            {
                double w;
                double sum_f = 0;
                double norm_f = 0.;
                for (int l = j-hKS.width; l <= j+(int)hKS.width; ++l)
                {
                    for (int k = i-(int)hKS.height ; k < i; ++k)
                    {
                        w = exp(-(l-j)*(l-j) - (k-i)*(k-i)/(kS.width*kS.height));
                        norm_f += w;
                        sum_f += w * f_srcImg.getScanline(k)[l]/((double)i-k);
                    }
                    
                    for (int k = i+1 ; k <= i+(int)hKS.height; ++k)
                    {
                        w = exp(-(l-j)*(l-j) - (k-i)*(k-i)/(kS.width*kS.height));
                        norm_f += w;
                        sum_f -= w * f_srcImg.getScanline(k)[l]/((double)k-i);
                    }
                }

                fr_dstImg.getScanline(i)[j] =  (_DstType) (sum_f/(f_norm_d*norm_f));
                //printf ("PrewV[%03i][%03i] = %f\n", i, j,  sum_f/norm_f );
            }
        }
    }
    
#endif

    return true;
}

/// Generic fallback of the IPP based vertical Prewitt filter.
///
/// This primary template performs no computation and always reports failure;
/// none of its arguments is read or modified.
///
/// \return Always false.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeVerticalIPP ( const CTypedImage<_SrcType> &f_srcImg,
                                                         CTypedImage<_DstType>       &fr_dstImg,
                                                         CTypedImage<_DstType>       &fr_buffer,
                                                         const S2D<unsigned int>     f_maskSize,
                                                         const _DstType              f_norm_d )
{
    /// Nothing is computed in the generic case.
    const bool success_b = false;

    return success_b;
}


/// IPP based vertical Prewitt filter (horizontal gradient) for float images.
///
/// The filter is applied separably: a row filter with 1/distance weighted
/// taps writes into fr_buffer, then a column filter with constant taps
/// (including the normalization 1/(height * (width-1) * f_norm_d)) writes
/// into fr_dstImg.
///
/// \param f_srcImg    Input image.
/// \param fr_dstImg   Output image; must have the size of f_srcImg.
/// \param fr_buffer   Auxiliary image; must have the size of f_srcImg.
/// \param f_maskSize  Size of the filter mask.
/// \param f_norm_d    Additional normalization factor.
/// \return false if the image sizes do not match, if the mask is degenerate
///         or too large for the internal kernel buffer, or if an IPP call
///         fails; true otherwise.
template <>
inline bool
CPrewittFilter<float, float>::computeVerticalIPP ( const CTypedImage<float>       &f_srcImg,
                                                   CTypedImage<float>       &fr_dstImg,
                                                   CTypedImage<float>       &fr_buffer,
                                                   const S2D<unsigned int>  f_maskSize,
                                                   const float              f_norm_d )
{
    if ( fr_dstImg.getWidth()  != f_srcImg.getWidth() ||
         fr_dstImg.getHeight() != f_srcImg.getHeight() )
        return false;

    if ( fr_buffer.getWidth() != f_srcImg.getWidth() ||
         fr_buffer.getHeight() != f_srcImg.getHeight() )
    {
        return false;
    }

    S2D<int> kS ( f_maskSize.width,f_maskSize.height);
    S2D<int> hKS( f_maskSize.width/2, 
                  f_maskSize.height/2 );

    /// Capacity of the stack buffer holding the filter taps.
    const int kernelCapacity_i = 256;

    /// A mask narrower than 2 columns has no side taps and would make the
    /// scale below a division by zero.
    if ( kS.width < 2 || kS.height < 1 )
        return false;

    /// The row kernel fills 2*hKS.width+1 taps and the column kernel
    /// kS.height taps; refuse masks that would overflow the tap buffer.
    if ( hKS.width > (kernelCapacity_i - 1) / 2 ||
         kS.height > kernelCapacity_i )
        return false;

    int h_i = f_srcImg.getSize().height;
    int w_i = f_srcImg.getSize().width;

    const float scale_f = 1./(kS.height * (kS.width-1) * f_norm_d);

    IppiSize dstRoiSize;
    dstRoiSize.width  = w_i - kS.width + 1;
    dstRoiSize.height = h_i;

    float kernel_p[kernelCapacity_i];
    
    /// Row kernel: taps are -1/offset, the center tap is 0.
    /// NOTE(review): the sign is presumably negated because IPP applies the
    /// kernel in reversed order -- confirm against the IPP documentation.
    for (int i = -hKS.width; i <= hKS.width; ++i)
        if (i) 
            kernel_p[i+hKS.width] = -1./i;
        else 
            kernel_p[hKS.width] = 0;
    
    IppStatus status = ippiFilterRow_32f_C1R ( f_srcImg.getScanline(0) + hKS.width, 
                                               w_i * sizeof(float),
                                               fr_buffer.getScanline(0) + hKS.width, 
                                               w_i * sizeof(float),
                                               dstRoiSize,
                                               kernel_p, 
                                               kS.width,
                                               hKS.width );

    if ( status != ippStsNoErr)
    {
        printf("IPP failed:\n");
        printIPPError( status );
        return false;
    }

    /// Column kernel: constant taps that already contain the normalization.
    for (int i = 0; i < kS.height; ++i)
        kernel_p[i] = scale_f;

    dstRoiSize.width  = w_i - kS.width  + 1;
    dstRoiSize.height = h_i - kS.height + 1;

    status = ippiFilterColumn_32f_C1R ( fr_buffer.getScanline(hKS.height) + hKS.width, 
                                        w_i * sizeof(float),
                                        fr_dstImg.getScanline(hKS.height) + hKS.width, 
                                        w_i * sizeof(float),
                                        dstRoiSize,
                                        kernel_p,
                                        kS.height,
                                        hKS.height );

    if ( status != ippStsNoErr)
    {
        printf("IPP failed:\n");
        printIPPError( status );
        return false;
    }

    return true;
}



/// Computes the vertical Prewitt response (horizontal gradient) of an image
/// using an auxiliary buffer and two separable passes.
///
/// Pass 1 writes, for every row, the 1/distance weighted difference between
/// the pixels right and left of the center into fr_buffer. Pass 2 sums
/// kS.height consecutive buffer rows with a running sum per column and writes
/// the sum divided by height * (width-1) * f_norm_d into fr_dstImg.
/// A 3x3 mask is forwarded to the dedicated 3x3 implementation.
///
/// In OpenMP builds the duration of each pass is printed to stdout.
///
/// NOTE(review): pass 2 steps rows with pointer arithmetic, which assumes
/// that scanlines are stored contiguously -- confirm against CTypedImage.
///
/// \param f_srcImg    Input image.
/// \param fr_dstImg   Output image; must have the size of f_srcImg.
/// \param fr_buffer   Auxiliary image; must have the size of f_srcImg.
/// \param f_maskSize  Size of the filter mask.
/// \param f_norm_d    Additional normalization factor.
/// \return false if the image sizes do not match, if the mask is degenerate
///         or if it does not fit into the image; true otherwise (for the
///         general path).
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeVertical ( const CTypedImage<_SrcType> &f_srcImg,
                                                      CTypedImage<_DstType>       &fr_dstImg,
                                                      CTypedImage<_DstType>       &fr_buffer,
                                                      const S2D<unsigned int>     f_maskSize,
                                                      const _DstType              f_norm_d )
{
    if ( fr_dstImg.getWidth()  != f_srcImg.getWidth() ||
         fr_dstImg.getHeight() != f_srcImg.getHeight() )
        return false;

    if ( fr_buffer.getWidth() != f_srcImg.getWidth() ||
         fr_buffer.getHeight() != f_srcImg.getHeight() )
    {
        return false;
    }
    
    if ( f_maskSize.width  == 3 &&
         f_maskSize.height == 3 )
        return computeVertical3x3 ( f_srcImg, 
                                    fr_dstImg, 
                                    fr_buffer, 
                                    f_norm_d  );

    S2D<int> kS ( f_maskSize.width,f_maskSize.height);
    S2D<int> hKS( f_maskSize.width/2, 
                  f_maskSize.height/2 );

    int h_i = f_srcImg.getSize().height;
    int w_i = f_srcImg.getSize().width;

    /// A mask narrower than 2 columns makes the normalization zero, and a
    /// mask larger than the image makes the running sum of pass 2 read rows
    /// that do not exist.
    if ( kS.width < 2 || kS.height < 1 ||
         w_i < kS.width || h_i < kS.height )
        return false;

#if defined ( _OPENMP )
    double start_d = omp_get_wtime();
    double end_d   = 0;
#endif

    const _DstType norm_f = kS.height * (kS.width-1) * f_norm_d;

    /// Pass 1: horizontal 1/distance weighted difference for every row.
#if defined ( _OPENMP )
    const int numThreads_ui = omp_get_max_threads();
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for (int i = 0; i < h_i; ++i)
    {
        /// src_p points to the leftmost tap, aux_p to the center position.
        _SrcType * src_p  = f_srcImg.getScanline(i);
        _DstType * aux_p  = fr_buffer.getScanline(i)+hKS.width;
        
        for (int j = hKS.width; j < w_i - hKS.width; ++j, ++src_p, ++aux_p)
        {
            _SrcType *p = src_p;
            
            *aux_p = 0;
            /// Signed offset of the current tap; negative left of the center.
            _DstType val = -hKS.width;
            
            int k;
            for (k = 0; k < hKS.width; ++k, ++p, ++val)
            {
                *aux_p += (_DstType) (*p)/(val);
            }
            
            /// The initial increments skip the center tap (offset 0).
            for (++k, ++p, ++val; k < kS.width; ++k, ++p, ++val)
            {
                *aux_p += (_DstType) (*p)/(val);
            }
        }
    }    

#if defined ( _OPENMP )
        end_d = omp_get_wtime();
        printf("VER First convolution %f milliseconds\n",
               (end_d-start_d)*1000.);
#endif

#if defined ( _OPENMP )
    start_d = omp_get_wtime();
#endif

    /// Pass 2: vertical running sum of kS.height buffer rows per column.
#if defined ( _OPENMP )
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for ( int j = hKS.width; j < w_i - hKS.width; ++j)
    {
        // Initial sum;
        _DstType sum = 0;
        _DstType *ptrU_p = fr_buffer.getScanline(0) + j;
        _DstType *ptrD_p = ptrU_p;
        
        for (int i = 0; i < kS.height; ++i, ptrD_p += w_i)
        {
            sum += *ptrD_p;
        }
        
        _DstType *dst_p = fr_dstImg.getScanline(hKS.height) + j;

        /// ptrD_p is the row entering the window, ptrU_p the row leaving it.
        for (int i = hKS.height; i < h_i - hKS.height - 1; ++i, dst_p+=w_i, ptrU_p+=w_i, ptrD_p+=w_i)
        {
            *dst_p = sum/norm_f;
            sum += *ptrD_p - *ptrU_p;
        }

        /// Last output row: no further row to slide to.
        *dst_p = sum/norm_f;
    }

#if defined ( _OPENMP )
        end_d = omp_get_wtime();
        printf("VER Second convolution %f milliseconds\n",
               (end_d-start_d)*1000.);
#endif

    return true;
}

/// Computes the horizontal Prewitt response (vertical gradient) of an image
/// using an auxiliary buffer and two separable passes.
///
/// Pass 1 writes, for every column, the 1/distance weighted difference
/// between the pixels above and below the center into fr_buffer. Pass 2 sums
/// kS.width consecutive buffer columns with a running sum per row and writes
/// the sum divided by width * (height-1) * f_norm_d into fr_dstImg.
/// A 3x3 mask is forwarded to the dedicated 3x3 implementation.
///
/// In OpenMP builds the duration of each pass is printed to stdout.
///
/// NOTE(review): pass 1 steps rows with pointer arithmetic, which assumes
/// that scanlines are stored contiguously -- confirm against CTypedImage.
///
/// \param f_srcImg    Input image.
/// \param fr_dstImg   Output image; must have the size of f_srcImg.
/// \param fr_buffer   Auxiliary image; must have the size of f_srcImg.
/// \param f_maskSize  Size of the filter mask.
/// \param f_norm_d    Additional normalization factor.
/// \return false if the image sizes do not match, if the mask is degenerate
///         or if it does not fit into the image; true otherwise (for the
///         general path).
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeHorizontal ( const CTypedImage<_SrcType> &f_srcImg,
                                                        CTypedImage<_DstType>       &fr_dstImg,
                                                        CTypedImage<_DstType>       &fr_buffer,
                                                        const S2D<unsigned int>     f_maskSize,
                                                        const _DstType              f_norm_d )
{
    if ( fr_dstImg.getWidth()  != f_srcImg.getWidth() ||
         fr_dstImg.getHeight() != f_srcImg.getHeight() )
        return false;

    if ( fr_buffer.getWidth()  != f_srcImg.getWidth() ||
         fr_buffer.getHeight() != f_srcImg.getHeight() )
    {
        return false;
    }

    if ( f_maskSize.width  == 3 &&
         f_maskSize.height == 3 )
        return computeHorizontal3x3 ( f_srcImg, 
                                      fr_dstImg, 
                                      fr_buffer, 
                                      f_norm_d );

    const S2D<int> kS ( f_maskSize.width,f_maskSize.height);
    const S2D<int> hKS( f_maskSize.width/2, 
                        f_maskSize.height/2 );
    
    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    /// A mask lower than 2 rows makes the normalization zero, and a mask
    /// larger than the image makes the running sum of pass 2 read columns
    /// that do not exist.
    if ( kS.height < 2 || kS.width < 1 ||
         w_i < kS.width || h_i < kS.height )
        return false;

    const _DstType norm_f = kS.width * (kS.height-1) * f_norm_d;

#if defined ( _OPENMP )
    double start_d = omp_get_wtime();
    double end_d   = 0;
#endif

    /// Pass 1: vertical 1/distance weighted difference for every column.
#if defined ( _OPENMP )
    const int numThreads_ui = omp_get_max_threads();
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for (int j = 0; j < w_i; ++j)
    {
        /// src_p points to the topmost tap, aux_p to the center position.
        _SrcType * src_p  = f_srcImg.getScanline(0) + j;
        _DstType * aux_p  = fr_buffer.getScanline(hKS.height) + j;
        
        for (int i = hKS.height; i < h_i - hKS.height; ++i, src_p+=w_i, aux_p+=w_i)
        {
            _SrcType *p = src_p;
            
            *aux_p = 0;
            /// Signed distance of the current tap; positive above the center.
            _DstType val = hKS.height;
            
            int k;
            for (k = 0; k < hKS.height; ++k, p+=w_i, --val)
            {
                *aux_p += (_DstType) (*p)/(val);
            }
            
            /// The initial increments skip the center tap (distance 0).
            for (++k, p+=w_i, --val; k < kS.height; ++k, p+=w_i, --val)
            {
                *aux_p += (_DstType) (*p)/(val);
            }
        }
    }

#if defined ( _OPENMP )
        end_d = omp_get_wtime();
        printf("HOR First convolution %f milliseconds\n",
               (end_d-start_d)*1000.);
#endif

#if defined ( _OPENMP )
    start_d = omp_get_wtime();
#endif

    /// Pass 2: horizontal running sum of kS.width buffer columns per row.
#if defined ( _OPENMP )
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for ( int i = hKS.height; i < h_i - hKS.height; ++i)
    {
        // Initial sum;
        _DstType sum = 0;
        _DstType *ptrL_p = fr_buffer.getScanline(i);
        _DstType *ptrR_p = ptrL_p;
        
        for (int j = 0; j < kS.width; ++j, ++ptrR_p)
        {
            sum += *ptrR_p;
        }
        
        _DstType *dst_p = fr_dstImg.getScanline(i) + hKS.width;

        /// ptrR_p is the column entering the window, ptrL_p the column leaving it.
        for (int j = hKS.width; j < w_i - hKS.width - 1; ++j, ++dst_p, ++ptrL_p, ++ptrR_p)
        {
            *dst_p = sum/norm_f;
            sum += *ptrR_p - *ptrL_p;
        }

        /// Last output column: no further column to slide to.
        *dst_p = sum/norm_f;
    }

#if defined ( _OPENMP )
        end_d = omp_get_wtime();
        printf("HOR Second convolution %f milliseconds\n",
               (end_d-start_d)*1000.);
#endif

    return true;
}


/// Computes the 3x3 vertical Prewitt response (horizontal gradient) of an
/// image using an auxiliary buffer and two separable passes.
///
/// Pass 1 writes the difference between the right and the left neighbour of
/// every interior pixel into fr_buffer. Pass 2 sums 3 consecutive buffer rows
/// with a running sum per column and writes the sum divided by 6 * f_norm_d
/// into fr_dstImg. Only interior pixels of fr_dstImg are written.
///
/// The image sizes are not compared here; fr_dstImg and fr_buffer are
/// accessed with the dimensions of f_srcImg.
///
/// NOTE(review): pass 2 steps rows with pointer arithmetic, which assumes
/// that scanlines are stored contiguously -- confirm against CTypedImage.
///
/// \param f_srcImg    Input image.
/// \param fr_dstImg   Output image.
/// \param fr_buffer   Auxiliary image.
/// \param f_norm_d    Additional normalization factor.
/// \return false if the image is smaller than 3x3, true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeVertical3x3 ( const CTypedImage<_SrcType> &f_srcImg,
                                                         CTypedImage<_DstType>       &fr_dstImg,
                                                         CTypedImage<_DstType>       &fr_buffer,
                                                         const _DstType              f_norm_d )
{
    int h_i = f_srcImg.getSize().height;
    int w_i = f_srcImg.getSize().width;

    /// The running sum below unconditionally reads buffer rows 0..2 and
    /// writes destination row 1, so the mask must fit into the image.
    if ( w_i < 3 || h_i < 3 )
        return false;

    const _DstType norm_f = 6 * f_norm_d;

    /// Pass 1: right neighbour minus left neighbour for every row.
#if defined ( _OPENMP )
    const int numThreads_ui = omp_get_max_threads();
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for (int i = 0; i < h_i; ++i)
    {
        /// src_p points to the left neighbour of the pixel written by aux_p.
        _SrcType * src_p  = f_srcImg.getScanline(i);
        _DstType * aux_p  = fr_buffer.getScanline(i)+1;
        
        for (int j = 1; j < w_i - 1; ++j, ++src_p, ++aux_p)
        {
            *aux_p = *(src_p+2) - *(src_p);
        }
    }    

    /// Pass 2: vertical running sum of 3 buffer rows per column.
#if defined ( _OPENMP )
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for ( int j = 1; j < w_i - 1; ++j)
    {
        // Initial sum;
        _DstType sum;
        _DstType *ptrU_p = fr_buffer.getScanline(0) + j;
        _DstType *ptrD_p = ptrU_p;
        
        sum  = *(ptrD_p); ptrD_p+=w_i;
        sum += *(ptrD_p); ptrD_p+=w_i;
        sum += *(ptrD_p); ptrD_p+=w_i;
        
        _DstType *dst_p = fr_dstImg.getScanline(1) + j;

        /// ptrD_p is the row entering the window, ptrU_p the row leaving it.
        for (int i = 1; i < h_i - 2; ++i, dst_p+=w_i, ptrU_p+=w_i, ptrD_p+=w_i)
        {
            *dst_p = sum/norm_f;
            sum += *ptrD_p - *ptrU_p;
        }

        /// Last output row: no further row to slide to.
        *dst_p = sum/norm_f;
    }

    return true;
}

/// Computes the 3x3 horizontal Prewitt response (vertical gradient) of an
/// image using an auxiliary buffer and two separable passes.
///
/// Pass 1 writes the difference between the upper and the lower neighbour of
/// every pixel of the interior rows into fr_buffer. Pass 2 sums 3 consecutive
/// buffer columns with a running sum per row and writes the sum divided by
/// 6 * f_norm_d into fr_dstImg. Only interior pixels of fr_dstImg are written.
///
/// The image sizes are not compared here; fr_dstImg and fr_buffer are
/// accessed with the dimensions of f_srcImg.
///
/// NOTE(review): pass 1 steps rows with pointer arithmetic, which assumes
/// that scanlines are stored contiguously -- confirm against CTypedImage.
///
/// \param f_srcImg    Input image.
/// \param fr_dstImg   Output image.
/// \param fr_buffer   Auxiliary image.
/// \param f_norm_d    Additional normalization factor.
/// \return false if the image is smaller than 3x3, true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeHorizontal3x3 ( const CTypedImage<_SrcType> &f_srcImg,
                                                           CTypedImage<_DstType>       &fr_dstImg,
                                                           CTypedImage<_DstType>       &fr_buffer,
                                                           const _DstType              f_norm_d )
{
    int h_i = f_srcImg.getSize().height;
    int w_i = f_srcImg.getSize().width;

    /// The running sum below unconditionally reads buffer columns 0..2 and
    /// writes destination column 1, so the mask must fit into the image.
    if ( w_i < 3 || h_i < 3 )
        return false;

    const _DstType norm_f = 6 * f_norm_d;

    /// Pass 1: upper neighbour minus lower neighbour for every column.
#if defined ( _OPENMP )
    const int numThreads_ui = omp_get_max_threads();
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for (int j = 0; j < w_i; ++j)
    {
        /// src_p points to the upper neighbour of the pixel written by aux_p.
        _SrcType * src_p  = f_srcImg.getScanline(0) + j;
        _DstType * aux_p  = fr_buffer.getScanline(1)+j;
        
        for (int i = 1; i < h_i - 1; ++i, src_p+=w_i, aux_p+=w_i)
        {
            /// Row above minus row below (two rows further down). This is
            /// the sign convention of the general buffered computeHorizontal,
            /// where the rows above the center have positive weights.
            *aux_p = *(src_p) - *(src_p + 2*w_i);
        }
    }

    /// Pass 2: horizontal running sum of 3 buffer columns per row.
#if defined ( _OPENMP )
#pragma omp parallel for num_threads(numThreads_ui) schedule(dynamic)
#endif
    for ( int i = 1; i < h_i - 1; ++i)
    {
        // Initial sum;
        _DstType sum;
        _DstType *ptrL_p = fr_buffer.getScanline(i);
        _DstType *ptrR_p = ptrL_p;
        
        sum  = *(ptrR_p); ++ptrR_p;
        sum += *(ptrR_p); ++ptrR_p;
        sum += *(ptrR_p); ++ptrR_p;
        
        _DstType *dst_p = fr_dstImg.getScanline(i) + 1;

        /// ptrR_p is the column entering the window, ptrL_p the column leaving it.
        for (int j = 1; j < w_i - 2; ++j, ++dst_p, ++ptrL_p, ++ptrR_p)
        {
            *dst_p = sum/norm_f;
            sum += *ptrR_p - *ptrL_p;
        }

        /// Last output column: no further column to slide to.
        *dst_p = sum/norm_f;
    }

    return true;
}

/// Computes the 3x3 "vertical" Prewitt response of f_srcImg (difference taken
/// along the row direction, i.e. right minus left) and stores it in fr_dstImg.
///
/// With S[c] denoting the sum of source column c over the 3 rows centred on
/// the output row, the value written to an interior pixel in column c is
///     ( S[c+1] - S[c-1] ) / ( 6 * f_norm_d ).
/// The column sums are kept in a buffer that is updated incrementally from one
/// row to the next. Only pixels at least 1 pixel away from every border are
/// written; the border of fr_dstImg is left untouched.
///
/// When compiled with OpenMP the rows are divided into numP_i horizontal
/// strips handled by a parallel for; each strip uses its own slice of the
/// sums buffer.
///
/// \param f_srcImg   Source image.
/// \param fr_dstImg  Destination image; must be at least as large as the source.
/// \param f_norm_d   Additional normalisation factor (result is divided by 6 * f_norm_d).
/// \return false if the destination is smaller than the source or the source
///         is smaller than 3x3 in either dimension (nothing is written);
///         true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeVertical3x3 ( const CTypedImage<_SrcType> &f_srcImg,
                                                         CTypedImage<_DstType>       &fr_dstImg,
                                                         const double                f_norm_d )
{
    /// The width is checked as well as the height: an image narrower than the
    /// mask has no computable pixel, and the per-row update of the sums buffer
    /// below always touches at least 2 entries, which would run past the end
    /// of the buffer for very narrow images.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getHeight() < 3 ||
         f_srcImg.getWidth()  < 3 )
        return false;

    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    const double norm_f = 6. * f_norm_d;
        
#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the height of the image.
    /// Every strip needs at least 3 rows.
    if ( h_i / numThreads_i < 3 )
        numP_i = std::min(std::max(1, h_i / 3 ), numThreads_i);    

    int hp_i = h_i / numP_i;
    /// One row of column sums per strip.
    _DstType * sumsVector_p = new _DstType[w_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int rowBot_i, rowTop_i;

        /// Extend the strip by one row on each side so that adjacent strips
        /// together produce every interior output row.
        rowTop_i = std::max(0,  p * hp_i - (int) 1);
        if ( p == (numP_i-1) )
            rowBot_i = h_i-1;
        else
            rowBot_i = ((p+1) * hp_i - 1) + 1;

        _DstType * sums_p = sumsVector_p + p * w_i;
#else
    _DstType * sums_p = new _DstType[w_i];
    int rowTop_i = 0;
    int rowBot_i = h_i-1;
    {
#endif
        /// Compute first vector sum (rows rowTop_i .. rowTop_i+2 of every column).
        for (int j = 0 ; j < w_i; ++j)
        {
            int i = rowTop_i;
            sums_p[j]  = f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i  )[j];
        }

        for (int i = rowTop_i; i <= rowBot_i-3; ++i)
        {
            /// range1_p: row entering the window, range2_p: row leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(i+3);
            _SrcType * range2_p = f_srcImg.getScanline(i);
            _DstType * ptrsum_p = sums_p;

            /// Make first sums.
            _DstType * ptrL_p = sums_p;
            _DstType * ptrR_p = sums_p + 2;
        
            _DstType *dst_p = fr_dstImg.getScanline(i+1) + 1;

            /// Write the output for the current window and, trailing behind
            /// the read pointers, slide the column sums down by one row.
            for (int j = 1; j < (int)(w_i - 1); 
                 ++j, ++dst_p, ++ptrL_p, ++ptrR_p, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                *dst_p = (_DstType) ((*ptrR_p - *ptrL_p)/norm_f);

                *ptrsum_p += *range1_p - *range2_p;
            }

            /// Update the 2 remaining column sums.
            for (int j = 0; j < (int)2; ++j, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last row without updating sums.
        /// Make first sums.

        _DstType * ptrL_p = sums_p;
        _DstType * ptrR_p = sums_p + 2;

        _DstType *dst_p = fr_dstImg.getScanline(rowBot_i-1) + 1;

        for (int j = 1; j < (int)(w_i - 1); ++j, ++dst_p, ++ptrL_p, ++ptrR_p)
        {
            *dst_p = (_DstType) (( *ptrR_p - *ptrL_p)/norm_f);
            //printf ("PrewU[%i][%i] = %f\n", h_i-1-1, j,  (sumR_f - sumL_f)/norm_f );
        }
    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

    return true;    
}
    
/// Computes the 3x3 "horizontal" Prewitt response of f_srcImg (difference
/// taken along the column direction, i.e. top minus bottom) and stores it in
/// fr_dstImg.
///
/// With R[r] denoting the sum of source row r over the 3 columns centred on
/// the output column, the value written to an interior pixel in row r is
///     ( R[r-1] - R[r+1] ) / ( 6 * f_norm_d ).
/// The row sums are kept in a buffer that is updated incrementally from one
/// column to the next. Only pixels at least 1 pixel away from every border are
/// written; the border of fr_dstImg is left untouched.
///
/// When compiled with OpenMP the columns are divided into numP_i vertical
/// strips handled by a parallel for; each strip uses its own slice of the
/// sums buffer.
///
/// NOTE: source and destination are traversed column-wise by adding the image
/// width to a pixel pointer, i.e. the code assumes scanlines are stored
/// contiguously with a stride equal to the image width.
///
/// \param f_srcImg   Source image.
/// \param fr_dstImg  Destination image; must be at least as large as the source.
/// \param f_norm_d   Additional normalisation factor (result is divided by 6 * f_norm_d).
/// \return false if the destination is smaller than the source or the source
///         is smaller than 3x3 in either dimension (nothing is written);
///         true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeHorizontal3x3 ( const CTypedImage<_SrcType> &f_srcImg,
                                                           CTypedImage<_DstType>       &fr_dstImg,
                                                           const double                f_norm_d )
{
    /// The height is checked as well as the width: an image shorter than the
    /// mask has no computable pixel, and the per-column update of the sums
    /// buffer below always touches at least 2 entries, which would run past
    /// the end of the buffer for very short images.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getWidth()  < 3 ||
         f_srcImg.getHeight() < 3 )
        return false;

    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    /// The destination is allowed to be wider than the source, so its rows
    /// must be stepped with its own width and not with the source width.
    const int dstW_i = (int) fr_dstImg.getWidth();

    const double norm_f = 6. * f_norm_d;

#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the width of the image.
    /// Every strip needs at least 3 columns.
    if ( w_i / numThreads_i < 3 )
        numP_i = std::min(std::max(1, w_i / 3 ), numThreads_i);    

    int wp_i = w_i / numP_i;
    /// One column of row sums per strip.
    _DstType * sumsVector_p = new _DstType[h_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int colLef_i, colRig_i;

        /// Extend the strip by one column on each side so that adjacent
        /// strips together produce every interior output column.
        colLef_i = std::max(0,  p * wp_i - (int) 1);
        if ( p == (numP_i-1) )
            colRig_i = w_i-1;
        else
            colRig_i = ((p+1) * wp_i - 1) + 1;

        _DstType * sums_p = sumsVector_p + p * h_i;
#else
    _DstType * sums_p = new _DstType[h_i];
    int colLef_i = 0;
    int colRig_i = w_i-1;
    {
#endif
    
        /// Compute first vector sum (columns colLef_i .. colLef_i+2 of every row).
        for (int i = 0; i < h_i; ++i)
        {
            int j = colLef_i;
            sums_p[i]  = f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j  ];
        }
                
        for (int j = colLef_i; j <= colRig_i-3; ++j)
        {
            /// range1_p: column entering the window, range2_p: column leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(0) + j+3;
            _SrcType * range2_p = f_srcImg.getScanline(0) + j;
            _DstType * ptrsum_p = sums_p;

            /// Make first sums.
            _DstType * ptrT_p = sums_p;
            _DstType * ptrB_p = sums_p + 2;

            _DstType *dst_p = fr_dstImg.getScanline(1) + j+1;
     
            /// Write the output for the current window and, trailing behind
            /// the read pointers, slide the row sums right by one column.
            for (int i = 1; i < (int)(h_i - 1); ++i, 
                     dst_p+=dstW_i, ++ptrT_p, ++ptrB_p, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                *dst_p =  (_DstType) ((*ptrT_p - *ptrB_p)/norm_f);

                *ptrsum_p += *range1_p - *range2_p;
            }

            /// Update the 2 remaining row sums.
            for (int i = 0 ; i < 3-1; ++i, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last column without updating sums.
        /// Make first sums.
        _DstType * ptrT_p = sums_p;
        _DstType * ptrB_p = sums_p + 2;
                
        _DstType *dst_p = fr_dstImg.getScanline(1) + colRig_i - 1;

        for (int i = 1; i < (int)(h_i - 1); ++i, dst_p+=dstW_i, ++ptrT_p, ++ptrB_p)
        {
            *dst_p =  (_DstType) ((*ptrT_p - *ptrB_p)/norm_f);
            //printf ("PrewV[%03i][%03i] = %f\n", i, w_i-1-1,  (sumT_f - sumB_f)/norm_f );
        }

    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

    return true;
}

/// Computes the 5x5 "vertical" Prewitt response of f_srcImg (difference taken
/// along the row direction, i.e. right minus left) and stores it in fr_dstImg.
///
/// With S[c] denoting the sum of source column c over the 5 rows centred on
/// the output row, the value written to an interior pixel in column c is
///     ( (S[c+2]/2 + S[c+1]) - (S[c-2]/2 + S[c-1]) ) / ( 20 * f_norm_d ),
/// where the weighted sums are evaluated in _DstType arithmetic.
/// The column sums are kept in a buffer that is updated incrementally from one
/// row to the next. Only pixels at least 2 pixels away from every border are
/// written; the border of fr_dstImg is left untouched.
///
/// When compiled with OpenMP the rows are divided into numP_i horizontal
/// strips handled by a parallel for; each strip uses its own slice of the
/// sums buffer.
///
/// \param f_srcImg   Source image.
/// \param fr_dstImg  Destination image; must be at least as large as the source.
/// \param f_norm_d   Additional normalisation factor (result is divided by 20 * f_norm_d).
/// \return false if the destination is smaller than the source or the source
///         is smaller than 5x5 in either dimension (nothing is written);
///         true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeVertical5x5 ( const CTypedImage<_SrcType> &f_srcImg,
                                                         CTypedImage<_DstType>       &fr_dstImg,
                                                         const double                f_norm_d )
{
    /// The width is checked as well as the height: an image narrower than the
    /// mask has no computable pixel, and the per-row update of the sums buffer
    /// below always touches at least 4 entries, which would run past the end
    /// of the buffer for narrower images.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getHeight() < 5 ||
         f_srcImg.getWidth()  < 5 )
        return false;

    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    const double norm_f = 20. * f_norm_d;
        
#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the height of the image.
    /// Every strip needs at least 5 rows.
    if ( h_i / numThreads_i < 5 )
        numP_i = std::min(std::max(1, h_i / 5 ), numThreads_i);    

    int hp_i = h_i / numP_i;
    /// One row of column sums per strip.
    _DstType * sumsVector_p = new _DstType[w_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int rowBot_i, rowTop_i;

        /// Extend the strip by two rows on each side so that adjacent strips
        /// together produce every interior output row.
        rowTop_i = std::max(0,  p * hp_i - (int) 2);
        if ( p == (numP_i-1) )
            rowBot_i = h_i-1;
        else
            rowBot_i = ((p+1) * hp_i - 1) + 2;

        _DstType * sums_p = sumsVector_p + p * w_i;

        //printf("Computing Vertical prewitt (hor gradient) from rowBot_i = %i to rowTop_i = %i with 5 = %i and h_i = %i numThreads_i = %i and maxP = %i\n",
        //       rowBot_i, rowTop_i, 5, h_i, numThreads_i, numP_i );
#else
    _DstType * sums_p = new _DstType[w_i];
    int rowTop_i = 0;
    int rowBot_i = h_i-1;
    {
#endif
        /// Compute first vector sum (rows rowTop_i .. rowTop_i+4 of every column).
        for (int j = 0 ; j < w_i; ++j)
        {
            int i = rowTop_i;
            sums_p[j]  = f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i  )[j];
        }

        for (int i = rowTop_i; i <= rowBot_i-5; ++i)
        {
            /// range1_p: row entering the window, range2_p: row leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(i+5);
            _SrcType * range2_p = f_srcImg.getScanline(i);
            _DstType * ptrsum_p = sums_p;

            _DstType  sumL_f;
            _DstType  sumR_f;

            /// Make first sums.
            _DstType * ptrL_p = sums_p;
            _DstType * ptrR_p = sums_p + 4;
        
            _DstType *dst_p = fr_dstImg.getScanline(i+2) + 2;

            /// Write the output for the current window and, trailing behind
            /// the read pointers, slide the column sums down by one row.
            for (int j = 2; j < (int)(w_i - 2); 
                 ++j, ++dst_p, ++ptrL_p, ++ptrR_p, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                sumL_f = *ptrL_p/2 + ptrL_p[ 1];
                sumR_f = *ptrR_p/2 + ptrR_p[-1];

                *dst_p = (_DstType) ((sumR_f - sumL_f)/norm_f);
                        
                *ptrsum_p += *range1_p - *range2_p;
            }

            /// Update the 4 remaining column sums.
            for (int j = 0; j < (int)4; ++j, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last row without updating sums.
        _DstType sumL_f;
        _DstType sumR_f;
     
        /// Make first sums.
        _DstType * ptrL_p = sums_p;
        _DstType * ptrR_p = sums_p + 4;

        _DstType *dst_p = fr_dstImg.getScanline(rowBot_i-2) + 2;

        for (int j = 2; j < (int)(w_i - 2); ++j, ++dst_p, ++ptrL_p, ++ptrR_p)
        {
            sumL_f = *ptrL_p/2 + ptrL_p[ 1];
            sumR_f = *ptrR_p/2 + ptrR_p[-1];

            *dst_p = (_DstType) ((sumR_f - sumL_f)/norm_f);
        }
    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

    return true;
}

/// Computes the 5x5 "horizontal" Prewitt response of f_srcImg (difference
/// taken along the column direction, i.e. top minus bottom) and stores it in
/// fr_dstImg.
///
/// With R[r] denoting the sum of source row r over the 5 columns centred on
/// the output column, the value written to an interior pixel in row r is
///     ( (R[r-2]/2 + R[r-1]) - (R[r+2]/2 + R[r+1]) ) / ( 20 * f_norm_d ),
/// where the weighted sums are evaluated in _DstType arithmetic.
/// The row sums are kept in a buffer that is updated incrementally from one
/// column to the next. Only pixels at least 2 pixels away from every border
/// are written; the border of fr_dstImg is left untouched.
///
/// When compiled with OpenMP the columns are divided into numP_i vertical
/// strips handled by a parallel for; each strip uses its own slice of the
/// sums buffer.
///
/// NOTE: source and destination are traversed column-wise by adding the image
/// width to a pixel pointer, i.e. the code assumes scanlines are stored
/// contiguously with a stride equal to the image width.
///
/// \param f_srcImg   Source image.
/// \param fr_dstImg  Destination image; must be at least as large as the source.
/// \param f_norm_d   Additional normalisation factor (result is divided by 20 * f_norm_d).
/// \return false if the destination is smaller than the source or the source
///         is smaller than 5x5 in either dimension (nothing is written);
///         true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeHorizontal5x5 ( const CTypedImage<_SrcType> &f_srcImg,
                                                           CTypedImage<_DstType>       &fr_dstImg,
                                                           const double                f_norm_d )
{
    /// The height is checked as well as the width: an image shorter than the
    /// mask has no computable pixel, and the per-column update of the sums
    /// buffer below always touches at least 4 entries, which would run past
    /// the end of the buffer for shorter images.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getWidth()  < 5 ||
         f_srcImg.getHeight() < 5 )
        return false;

    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    /// The destination is allowed to be wider than the source, so its rows
    /// must be stepped with its own width and not with the source width.
    const int dstW_i = (int) fr_dstImg.getWidth();

    const double norm_f = 20 * f_norm_d;

#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the width of the image.
    /// Every strip needs at least 5 columns.
    if ( w_i / numThreads_i < 5 )
        numP_i = std::min(std::max(1, w_i / 5 ), numThreads_i);    

    int wp_i = w_i / numP_i;
    /// One column of row sums per strip.
    _DstType * sumsVector_p = new _DstType[h_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int colLef_i, colRig_i;

        /// Extend the strip by two columns on each side so that adjacent
        /// strips together produce every interior output column.
        colLef_i = std::max(0,  p * wp_i - (int) 2);
        if ( p == (numP_i-1) )
            colRig_i = w_i-1;
        else
            colRig_i = ((p+1) * wp_i - 1) + 2;

        _DstType * sums_p = sumsVector_p + p * h_i;

        //printf("Computing Horizontal prewitt (ver gradient) from colLef_i = %i to colRig_i = %i with 5 = %i and w_i = %i numThreads_i = %i and maxP = %i\n",
        //       colLef_i, colRig_i, 5, w_i, numThreads_i, numP_i );
#else
    _DstType * sums_p = new _DstType[h_i];
    int colLef_i = 0;
    int colRig_i = w_i-1;
    {
#endif

        /// Compute first vector sum (columns colLef_i .. colLef_i+4 of every row).
        for (int i = 0; i < h_i; ++i)
        {
            int j = colLef_i;
            sums_p[i]  = f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j  ];
        }
                
        for (int j = colLef_i; j <= colRig_i-5; ++j)
        {
            /// range1_p: column entering the window, range2_p: column leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(0) + j+5;
            _SrcType * range2_p = f_srcImg.getScanline(0) + j;
            _DstType * ptrsum_p = sums_p;

            _DstType sumT_f;
            _DstType sumB_f;

            /// Make first sums.
            _DstType * ptrT_p = sums_p;
            _DstType * ptrB_p = sums_p + 4;

            _DstType *dst_p = fr_dstImg.getScanline(2) + j+2;
     
            /// Write the output for the current window and, trailing behind
            /// the read pointers, slide the row sums right by one column.
            for (int i = 2; i < (int)(h_i - 2); ++i, 
                     dst_p+=dstW_i, ++ptrT_p, ++ptrB_p, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                sumT_f = *ptrT_p/2 + ptrT_p[ 1];
                sumB_f = *ptrB_p/2 + ptrB_p[-1];
            
                *dst_p =  (_DstType) ((sumT_f - sumB_f)/norm_f);

                *ptrsum_p += *range1_p - *range2_p;
            }

            /// Update the 4 remaining row sums.
            for (int i = 0 ; i < 4; ++i, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last column without updating sums.
        _DstType sumT_f;
        _DstType sumB_f;
                
        /// Make first sums.
        _DstType * ptrT_p = sums_p;
        _DstType * ptrB_p = sums_p + 4;
                
        _DstType *dst_p = fr_dstImg.getScanline(2) + colRig_i - 2;

        for (int i = 2; i < (int)(h_i - 2); ++i, dst_p+=dstW_i, ++ptrT_p, ++ptrB_p)
        {
            sumT_f = *ptrT_p/2 + ptrT_p[ 1];
            sumB_f = *ptrB_p/2 + ptrB_p[-1];
            
            *dst_p =  (_DstType) ((sumT_f - sumB_f)/norm_f);
        }

    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

    return true;    
}
    
/// Computes the 7x7 "vertical" Prewitt response of f_srcImg (difference taken
/// along the row direction, i.e. right minus left) and stores it in fr_dstImg.
///
/// With S[c] denoting the sum of source column c over the 7 rows centred on
/// the output row, the value written to an interior pixel in column c is
///     ( (S[c+3]/3 + S[c+2]/2 + S[c+1]) - (S[c-3]/3 + S[c-2]/2 + S[c-1]) )
///         / ( 42 * f_norm_d ),
/// where the weighted sums are evaluated in _DstType arithmetic.
/// The column sums are kept in a buffer that is updated incrementally from one
/// row to the next. Only pixels at least 3 pixels away from every border are
/// written; the border of fr_dstImg is left untouched.
///
/// When compiled with OpenMP the rows are divided into numP_i horizontal
/// strips handled by a parallel for; each strip uses its own slice of the
/// sums buffer.
///
/// \param f_srcImg   Source image.
/// \param fr_dstImg  Destination image; must be at least as large as the source.
/// \param f_norm_d   Additional normalisation factor (result is divided by 42 * f_norm_d).
/// \return false if the destination is smaller than the source or the source
///         is smaller than 7x7 in either dimension (nothing is written);
///         true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeVertical7x7 ( const CTypedImage<_SrcType> &f_srcImg,
                                                         CTypedImage<_DstType>       &fr_dstImg,
                                                         const double                f_norm_d )
{
    /// The width is checked as well as the height: an image narrower than the
    /// mask has no computable pixel, and the per-row update of the sums buffer
    /// below always touches at least 6 entries, which would run past the end
    /// of the buffer for narrower images.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getHeight() < 7 ||
         f_srcImg.getWidth()  < 7 )
        return false;

    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    const double norm_f = 42. * f_norm_d;
        
#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the height of the image.
    /// Every strip needs at least 7 rows.
    if ( h_i / numThreads_i < 7 )
        numP_i = std::min(std::max(1, h_i / 7 ), numThreads_i);    

    int hp_i = h_i / numP_i;
    /// One row of column sums per strip.
    _DstType * sumsVector_p = new _DstType[w_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int rowBot_i, rowTop_i;

        /// Extend the strip by three rows on each side so that adjacent strips
        /// together produce every interior output row.
        rowTop_i = std::max(0,  p * hp_i - (int) 3);
        if ( p == (numP_i-1) )
            rowBot_i = h_i-1;
        else
            rowBot_i = ((p+1) * hp_i - 1) + 3;

        _DstType * sums_p = sumsVector_p + p * w_i;

        //printf("Computing Vertical prewitt (hor gradient) from rowBot_i = %i to rowTop_i = %i with 7 = %i and h_i = %i numThreads_i = %i and maxP = %i\n",
        //       rowBot_i, rowTop_i, 7, h_i, numThreads_i, numP_i );
#else
    _DstType * sums_p = new _DstType[w_i];
    int rowTop_i = 0;
    int rowBot_i = h_i-1;
    {
#endif
        /// Compute first vector sum (rows rowTop_i .. rowTop_i+6 of every column).
        for (int j = 0 ; j < w_i; ++j)
        {
            int i = rowTop_i;
            sums_p[j]  = f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i  )[j];
        }

        for (int i = rowTop_i; i <= rowBot_i-7; ++i)
        {
            /// range1_p: row entering the window, range2_p: row leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(i+7);
            _SrcType * range2_p = f_srcImg.getScanline(i);
            _DstType * ptrsum_p = sums_p;

            _DstType  sumL_f;
            _DstType  sumR_f;

            /// Make first sums.
            _DstType * ptrL_p = sums_p;
            _DstType * ptrR_p = sums_p + 6;
        
            _DstType *dst_p = fr_dstImg.getScanline(i+3) + 3;

            /// Write the output for the current window and, trailing behind
            /// the read pointers, slide the column sums down by one row.
            for (int j = 3; j < (int)(w_i - 3); 
                 ++j, ++dst_p, ++ptrL_p, ++ptrR_p, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                sumL_f = *ptrL_p/3 + ptrL_p[ 1]/2 + ptrL_p[ 2];
                sumR_f = *ptrR_p/3 + ptrR_p[-1]/2 + ptrR_p[-2];

                *dst_p = (_DstType) ((sumR_f - sumL_f)/norm_f);

                *ptrsum_p += *range1_p - *range2_p;
            }

            /// Update the 6 remaining column sums.
            for (int j = 0; j < (int)6; ++j, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last row without updating sums.
        _DstType sumL_f;
        _DstType sumR_f;
     
        /// Make first sums.
        _DstType * ptrL_p = sums_p;
        _DstType * ptrR_p = sums_p + 6;

        _DstType *dst_p = fr_dstImg.getScanline(rowBot_i-3) + 3;

        for (int j = 3; j < (int)(w_i - 3); ++j, ++dst_p, ++ptrL_p, ++ptrR_p)
        {
            /// Compute sums.
            sumL_f = *ptrL_p/3 + ptrL_p[ 1]/2 + ptrL_p[ 2];
            sumR_f = *ptrR_p/3 + ptrR_p[-1]/2 + ptrR_p[-2];
            
            *dst_p = (_DstType) ((sumR_f - sumL_f)/norm_f);
        }
    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

    return true;
}
    

/// Computes the 7x7 "horizontal" Prewitt response of f_srcImg (difference
/// taken along the column direction, i.e. top minus bottom) and stores it in
/// fr_dstImg.
///
/// With R[r] denoting the sum of source row r over the 7 columns centred on
/// the output column, the value written to an interior pixel in row r is
///     ( (R[r-3]/3 + R[r-2]/2 + R[r-1]) - (R[r+3]/3 + R[r+2]/2 + R[r+1]) )
///         / ( 42 * f_norm_d ),
/// where the weighted sums are evaluated in _DstType arithmetic.
/// The row sums are kept in a buffer that is updated incrementally from one
/// column to the next. Only pixels at least 3 pixels away from every border
/// are written; the border of fr_dstImg is left untouched.
///
/// When compiled with OpenMP the columns are divided into numP_i vertical
/// strips handled by a parallel for; each strip uses its own slice of the
/// sums buffer.
///
/// NOTE: source and destination are traversed column-wise by adding the image
/// width to a pixel pointer, i.e. the code assumes scanlines are stored
/// contiguously with a stride equal to the image width.
///
/// \param f_srcImg   Source image.
/// \param fr_dstImg  Destination image; must be at least as large as the source.
/// \param f_norm_d   Additional normalisation factor (result is divided by 42 * f_norm_d).
/// \return false if the destination is smaller than the source or the source
///         is smaller than 7x7 in either dimension (nothing is written);
///         true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeHorizontal7x7 ( const CTypedImage<_SrcType> &f_srcImg,
                                                           CTypedImage<_DstType>       &fr_dstImg,
                                                           const double                f_norm_d )
{
    /// The height is checked as well as the width: an image shorter than the
    /// mask has no computable pixel, and the per-column update of the sums
    /// buffer below always touches at least 6 entries, which would run past
    /// the end of the buffer for shorter images.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getWidth()  < 7 ||
         f_srcImg.getHeight() < 7 )
        return false;

    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    /// The destination is allowed to be wider than the source, so its rows
    /// must be stepped with its own width and not with the source width.
    const int dstW_i = (int) fr_dstImg.getWidth();

    const double norm_f = 42 * f_norm_d;

#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the width of the image.
    /// Every strip needs at least 7 columns.
    if ( w_i / numThreads_i < 7 )
        numP_i = std::min(std::max(1, w_i / 7 ), numThreads_i);    

    int wp_i = w_i / numP_i;
    /// One column of row sums per strip.
    _DstType * sumsVector_p = new _DstType[h_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int colLef_i, colRig_i;

        /// Extend the strip by three columns on each side so that adjacent
        /// strips together produce every interior output column.
        colLef_i = std::max(0,  p * wp_i - (int) 3);
        if ( p == (numP_i-1) )
            colRig_i = w_i-1;
        else
            colRig_i = ((p+1) * wp_i - 1) + 3;

        _DstType * sums_p = sumsVector_p + p * h_i;

        //printf("Computing Horizontal prewitt (ver gradient) from colLef_i = %i to colRig_i = %i with 7 = %i and w_i = %i numThreads_i = %i and maxP = %i\n",
        //       colLef_i, colRig_i, 7, w_i, numThreads_i, numP_i );
#else
    _DstType * sums_p = new _DstType[h_i];
    int colLef_i = 0;
    int colRig_i = w_i-1;
    {
#endif
        /// Compute first vector sum (columns colLef_i .. colLef_i+6 of every row).
        for (int i = 0; i < h_i; ++i)
        {
            int j = colLef_i;
            sums_p[i]  = f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j  ];
        }
                
        for (int j = colLef_i; j <= colRig_i-7; ++j)
        {
            /// range1_p: column entering the window, range2_p: column leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(0) + j+7;
            _SrcType * range2_p = f_srcImg.getScanline(0) + j;
            _DstType * ptrsum_p = sums_p;

            _DstType sumT_f;
            _DstType sumB_f;

            /// Make first sums.
            _DstType * ptrT_p = sums_p;
            _DstType * ptrB_p = sums_p + 6;

            _DstType *dst_p = fr_dstImg.getScanline(3) + j+3;
     
            /// Write the output for the current window and, trailing behind
            /// the read pointers, slide the row sums right by one column.
            for (int i = 3; i < (int)(h_i - 3); ++i, 
                     dst_p+=dstW_i, ++ptrT_p, ++ptrB_p, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                sumT_f = *ptrT_p/3 + ptrT_p[ 1]/2 + ptrT_p[ 2];
                sumB_f = *ptrB_p/3 + ptrB_p[-1]/2 + ptrB_p[-2];
            
                *dst_p =  (_DstType) ((sumT_f - sumB_f)/norm_f);

                *ptrsum_p += *range1_p - *range2_p;
             }

            /// Update the 6 remaining row sums.
            for (int i = 0 ; i < 6; ++i, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last column without updating sums.
        _DstType sumT_f;
        _DstType sumB_f;
                
        /// Make first sums.
        _DstType * ptrT_p = sums_p;
        _DstType * ptrB_p = sums_p + 6;
                
        _DstType *dst_p = fr_dstImg.getScanline(3) + colRig_i - 3;

        for (int i = 3; i < (int)(h_i - 3); ++i, dst_p+=dstW_i, ++ptrT_p, ++ptrB_p)
        {
            /// Compute sums.
            sumT_f = *ptrT_p/3 + ptrT_p[ 1]/2 + ptrT_p[ 2];
            sumB_f = *ptrB_p/3 + ptrB_p[-1]/2 + ptrB_p[-2];
            
            *dst_p =  (_DstType) ((sumT_f - sumB_f)/norm_f);
        }
    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

    return true;
}
    
/// Computes the 9x9 "vertical" Prewitt response of f_srcImg (difference taken
/// along the row direction, i.e. right minus left) and stores it in fr_dstImg.
///
/// With S[c] denoting the sum of source column c over the 9 rows centred on
/// the output row, the value written to an interior pixel in column c is
///     ( (S[c+4]/4 + S[c+3]/3 + S[c+2]/2 + S[c+1])
///     - (S[c-4]/4 + S[c-3]/3 + S[c-2]/2 + S[c-1]) ) / ( 72 * f_norm_d ),
/// where the weighted sums are evaluated in _DstType arithmetic.
/// The column sums are kept in a buffer that is updated incrementally from one
/// row to the next. Only pixels at least 4 pixels away from every border are
/// written; the border of fr_dstImg is left untouched.
///
/// When compiled with OpenMP the rows are divided into numP_i horizontal
/// strips handled by a parallel for; each strip uses its own slice of the
/// sums buffer.
///
/// \param f_srcImg   Source image.
/// \param fr_dstImg  Destination image; must be at least as large as the source.
/// \param f_norm_d   Additional normalisation factor (result is divided by 72 * f_norm_d).
/// \return false if the destination is smaller than the source or the source
///         is smaller than 9x9 in either dimension (nothing is written);
///         true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeVertical9x9 ( const CTypedImage<_SrcType> &f_srcImg,
                                                         CTypedImage<_DstType>       &fr_dstImg,
                                                         const double                f_norm_d )
{
    /// The width is checked as well as the height: an image narrower than the
    /// mask has no computable pixel, and the per-row update of the sums buffer
    /// below always touches at least 8 entries, which would run past the end
    /// of the buffer for narrower images.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getHeight() < 9 ||
         f_srcImg.getWidth()  < 9 )
        return false;
    
    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    const double norm_f = 72. * f_norm_d;
        
#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the height of the image.
    /// Every strip needs at least 9 rows.
    if ( h_i / numThreads_i < 9 )
        numP_i = std::min(std::max(1, h_i / 9 ), numThreads_i);    

    int hp_i = h_i / numP_i;
    /// One row of column sums per strip.
    _DstType * sumsVector_p = new _DstType[w_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int rowBot_i, rowTop_i;

        /// Extend the strip by four rows on each side so that adjacent strips
        /// together produce every interior output row.
        rowTop_i = std::max(0,  p * hp_i - (int) 4);
        if ( p == (numP_i-1) )
            rowBot_i = h_i-1;
        else
            rowBot_i = ((p+1) * hp_i - 1) + 4;

        _DstType * sums_p = sumsVector_p + p * w_i;

        //printf("Computing Vertical prewitt (hor gradient) from rowBot_i = %i to rowTop_i = %i with 9 = %i and h_i = %i numThreads_i = %i and maxP = %i\n",
        //       rowBot_i, rowTop_i, 9, h_i, numThreads_i, numP_i );
#else
    _DstType * sums_p = new _DstType[w_i];
    int rowTop_i = 0;
    int rowBot_i = h_i-1;
    {
#endif

        /// Compute first vector sum (rows rowTop_i .. rowTop_i+8 of every column).
        for (int j = 0 ; j < w_i; ++j)
        {
            int i = rowTop_i;
            sums_p[j]  = f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i++)[j];
            sums_p[j] += f_srcImg.getScanline(i  )[j];
        }

        for (int i = rowTop_i; i <= rowBot_i-9; ++i)
        {
            /// range1_p: row entering the window, range2_p: row leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(i+9);
            _SrcType * range2_p = f_srcImg.getScanline(i);
            _DstType * ptrsum_p = sums_p;

            _DstType  sumL_f;
            _DstType  sumR_f;

            /// Make first sums.
            _DstType * ptrL_p = sums_p;
            _DstType * ptrR_p = sums_p + 8;
        
            _DstType *dst_p = fr_dstImg.getScanline(i+4) + 4;

            /// Write the output for the current window and, trailing behind
            /// the read pointers, slide the column sums down by one row.
            for (int j = 4; j < (int)(w_i - 4); 
                 ++j, ++dst_p, ++ptrL_p, ++ptrR_p, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                sumL_f = *ptrL_p/4 + ptrL_p[ 1]/3 + ptrL_p[ 2]/2 + ptrL_p[ 3];
                sumR_f = *ptrR_p/4 + ptrR_p[-1]/3 + ptrR_p[-2]/2 + ptrR_p[-3];

                *dst_p = (_DstType) ((sumR_f - sumL_f)/norm_f);

                *ptrsum_p += *range1_p - *range2_p;
            }

            /// Update the 8 remaining column sums.
            for (int j = 0; j < (int)9-1; ++j, ++range1_p, ++range2_p, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last row without updating sums.
        _DstType sumL_f;
        _DstType sumR_f;
     
        /// Make first sums.
        _DstType * ptrL_p = sums_p;
        _DstType * ptrR_p = sums_p + 8;

        _DstType *dst_p = fr_dstImg.getScanline(rowBot_i-4) + 4;

        for (int j = 4; j < (int)(w_i - 4); ++j, ++dst_p, ++ptrL_p, ++ptrR_p)
        {
            sumL_f = *ptrL_p/4 + ptrL_p[ 1]/3 + ptrL_p[ 2]/2 + ptrL_p[ 3];
            sumR_f = *ptrR_p/4 + ptrR_p[-1]/3 + ptrR_p[-2]/2 + ptrR_p[-3];
            
            *dst_p = (_DstType) ((sumR_f - sumL_f)/norm_f);
        }
    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

    return true;
}
    
/// Computes the 9x9 "horizontal" Prewitt response of f_srcImg (difference
/// taken along the column direction, i.e. top minus bottom) and stores it in
/// fr_dstImg.
///
/// With R[r] denoting the sum of source row r over the 9 columns centred on
/// the output column, the value written to an interior pixel in row r is
///     ( (R[r-4]/4 + R[r-3]/3 + R[r-2]/2 + R[r-1])
///     - (R[r+4]/4 + R[r+3]/3 + R[r+2]/2 + R[r+1]) ) / ( 72 * f_norm_d ),
/// where the weighted sums are evaluated in _DstType arithmetic.
/// The row sums are kept in a buffer that is updated incrementally from one
/// column to the next. Only pixels at least 4 pixels away from every border
/// are written; the border of fr_dstImg is left untouched.
///
/// When compiled with OpenMP the columns are divided into numP_i vertical
/// strips handled by a parallel for; each strip uses its own slice of the
/// sums buffer.
///
/// NOTE: source and destination are traversed column-wise by adding the image
/// width to a pixel pointer, i.e. the code assumes scanlines are stored
/// contiguously with a stride equal to the image width.
///
/// \param f_srcImg   Source image.
/// \param fr_dstImg  Destination image; must be at least as large as the source.
/// \param f_norm_d   Additional normalisation factor (result is divided by 72 * f_norm_d).
/// \return false if the destination is smaller than the source or the source
///         is smaller than 9x9 in either dimension (nothing is written);
///         true otherwise.
template <class _SrcType, class _DstType>
inline bool
CPrewittFilter<_SrcType, _DstType>::computeHorizontal9x9 ( const CTypedImage<_SrcType> &f_srcImg,
                                                           CTypedImage<_DstType>       &fr_dstImg,
                                                           const double                f_norm_d )
{
    /// The height is checked as well as the width: an image shorter than the
    /// mask has no computable pixel, and the per-column update of the sums
    /// buffer below always touches at least 8 entries, which would run past
    /// the end of the buffer for shorter images.
    if ( fr_dstImg.getWidth()  < f_srcImg.getWidth() ||
         fr_dstImg.getHeight() < f_srcImg.getHeight() ||
         f_srcImg.getWidth()  < 9 ||
         f_srcImg.getHeight() < 9 )
        return false;

    const int h_i = f_srcImg.getSize().height;
    const int w_i = f_srcImg.getSize().width;

    /// The destination is allowed to be wider than the source, so its rows
    /// must be stepped with its own width and not with the source width.
    const int dstW_i = (int) fr_dstImg.getWidth();

    const double norm_f = 72. * f_norm_d;

#if defined ( _OPENMP )
    const int numThreads_i = omp_get_max_threads();
    
    int numP_i = numThreads_i;
    /// Check how many processes must start according to the width of the image.
    /// Every strip needs at least 9 columns.
    if ( w_i / numThreads_i < 9 )
        numP_i = std::min(std::max(1, w_i / 9 ), numThreads_i);    

    int wp_i = w_i / numP_i;
    /// One column of row sums per strip.
    _DstType * sumsVector_p = new _DstType[h_i*numP_i];

#pragma omp parallel for num_threads(numP_i) schedule(dynamic)
    for (int p = 0; p < numP_i; ++p)
    {
        int colLef_i, colRig_i;

        /// Extend the strip by four columns on each side so that adjacent
        /// strips together produce every interior output column.
        colLef_i = std::max(0,  p * wp_i - (int) 4);
        if ( p == (numP_i-1) )
            colRig_i = w_i-1;
        else
            colRig_i = ((p+1) * wp_i - 1) + 4;

        _DstType * sums_p = sumsVector_p + p * h_i;

        //printf("Computing Horizontal prewitt (ver gradient) from colLef_i = %i to colRig_i = %i with 9 = %i and w_i = %i numThreads_i = %i and maxP = %i\n",
        //       colLef_i, colRig_i, 9, w_i, numThreads_i, numP_i );
#else
    _DstType * sums_p = new _DstType[h_i];
    int colLef_i = 0;
    int colRig_i = w_i-1;
    {
#endif
    
        /// Compute first vector sum (columns colLef_i .. colLef_i+8 of every row).
        for (int i = 0; i < h_i; ++i)
        {
            int j = colLef_i;
            sums_p[i]  = f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j++];
            sums_p[i] += f_srcImg.getScanline(i)[j  ];
        }

        for (int j = colLef_i; j <= colRig_i-9; ++j)
        {
            /// range1_p: column entering the window, range2_p: column leaving it.
            _SrcType * range1_p = f_srcImg.getScanline(0) + j+9;
            _SrcType * range2_p = f_srcImg.getScanline(0) + j;
            _DstType * ptrsum_p = sums_p;

            _DstType sumT_f;
            _DstType sumB_f;

            /// Make first sums.
            _DstType * ptrT_p = sums_p;
            _DstType * ptrB_p = sums_p + 8;

            _DstType *dst_p = fr_dstImg.getScanline(4) + j+4;
     
            /// Write the output for the current window and, trailing behind
            /// the read pointers, slide the row sums right by one column.
            for (int i = 4; i < (int)(h_i - 4); ++i, 
                     dst_p+=dstW_i, ++ptrT_p, ++ptrB_p, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                sumT_f = *ptrT_p/4 + ptrT_p[ 1]/3 + ptrT_p[ 2]/2 + ptrT_p[ 3];
                sumB_f = *ptrB_p/4 + ptrB_p[-1]/3 + ptrB_p[-2]/2 + ptrB_p[-3];
            
                *dst_p =  (_DstType) ((sumT_f - sumB_f)/norm_f);

                *ptrsum_p += *range1_p - *range2_p;
            }

            /// Update the 8 remaining row sums.
            for (int i = 0 ; i < 9-1; ++i, range1_p+=w_i, range2_p+=w_i, ++ptrsum_p)
            {
                *ptrsum_p += *range1_p - *range2_p;
            }
        }

        // process now last column without updating sums.
        _DstType sumT_f;
        _DstType sumB_f;
                
        /// Make first sums.
        _DstType * ptrT_p = sums_p;
        _DstType * ptrB_p = sums_p + 8;
                
        _DstType *dst_p = fr_dstImg.getScanline(4) + colRig_i - 4;

        for (int i = 4; i < (int)(h_i - 4); ++i, dst_p+=dstW_i, ++ptrT_p, ++ptrB_p)
        {
            /// Compute sums.
            sumT_f = *ptrT_p/4 + ptrT_p[ 1]/3 + ptrT_p[ 2]/2 + ptrT_p[ 3];
            sumB_f = *ptrB_p/4 + ptrB_p[-1]/3 + ptrB_p[-2]/2 + ptrB_p[-3];
            
            *dst_p =  (_DstType) ((sumT_f - sumB_f)/norm_f);
        }

    }
    
#if defined ( _OPENMP )
    delete [] sumsVector_p;
#else
    delete [] sums_p;
#endif

    return true;
}
    
