%
% Programmed by Chanwoo Kim for the ASRU 2009
%
% (chanwook@cs.cmu.edu; chanwook@gmail.com)
%
%  SPB-R algorithm
%
% Usage 
% 
%  szOutFeatFileName : Output feature or wave file name
%  szInFileName      : Input file name
%  bLSMS             : Option selecting whether LSMS (log spectral
%                      mean subtraction) is used. If you don't specify
%                      this option, the default is to use LSMS.
%
%                      Set it to 1 to use LSMS, or to 0 to disable it.
%
% In the source code, if you turn off the bFeat option, then it
% generates a wave file instead of doing feature extraction
%
function [aadDCT] = SPB_R(szOutFeatFileName, szInFileName, bLSMS)
% SPB_R  Small-power boosting with resynthesis, followed by feature extraction.
%
%   aadDCT = SPB_R(szOutFeatFileName, szInFileName)
%   aadDCT = SPB_R(szOutFeatFileName, szInFileName, bLSMS)
%
%   Inputs
%     szOutFeatFileName : output feature file name (or wave file name when
%                         the internal bFeat switch is turned off)
%     szInFileName      : input file; the first 1024 bytes are skipped and
%                         the rest is read as int16 samples
%     bLSMS             : 1 = apply log spectral mean subtraction,
%                         0 = skip it. Defaults to 1 when omitted.
%
%   Output
%     aadDCT            : the feature matrix returned by FE_FeatExt (the
%                         same data that is written to the output file);
%                         empty when bFeat is turned off.
%
%   Raises an error when either file cannot be opened or when the input is
%   too short for the resynthesis stage.

    if nargin < 3
        bLSMS = 1;
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Reading the input samples
    %
    fid = fopen(szInFileName, 'rb');
    if fid < 0
        error('SPB_R:cannotOpenInput', 'Cannot open input file: %s', szInFileName);
    end
    % Skip the first 1024 bytes (presumably a fixed-size file header --
    % confirm against the corpus format).
    fseek(fid, 1024, 'bof');
    ad_x = fread(fid, 'int16');
    fclose(fid);

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Processing switches (edit here to change the behaviour)
    %
    bPreem   = 1;   % apply pre-emphasis before the analysis
    bDisplay = 0;   % show the reconstructed log spectrum of the features
    bFeat    = 1;   % 1: write features, 0: write the resynthesized wave
    bSPB_R   = 1;   % apply small-power boosting and resynthesis

    dFrameLen    = 0.05;            % 50 ms analysis window
    dSampRate    = 16000;
    dFramePeriod = dFrameLen / 4;   % 12.5 ms frame period (75 % overlap)

    iFL        = floor(dFrameLen    * dSampRate);
    iFP        = floor(dFramePeriod * dSampRate);
    iSpeechLen = length(ad_x);
    iNumFrames = floor((iSpeechLen - iFL) / iFP) + 1;

    iFFTSize = 2^ceil(log2(iFL));
    iNumChan = 40;

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Pre-emphasis using H(z) = 1 - 0.97 z ^ -1
    %
    if bPreem == 1
        ad_x = filter([1 -0.97], 1, ad_x);
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Obtaining the spectrum X(e^(jw_k)); only the first half of the FFT
    % bins is kept for every frame.
    %
    aad_X    = zeros(iFFTSize / 2, iNumFrames);
    adWindow = hamming(iFL);        % built once, reused for every frame

    iFI = 0;
    for m = 0 : iFP : iSpeechLen - iFL
        iFI           = iFI + 1;
        adSpec        = fft(ad_x(m + 1 : m + iFL) .* adWindow, iFFTSize);
        aad_X(:, iFI) = adSpec(1 : iFFTSize / 2);
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Log Spectral Mean Subtraction for each frequency index
    %
    if bLSMS == 1
        LSMS_aad_X_tilde = zeros(size(aad_X));
        % Half-width of the moving average: 1.5 s on each side of the
        % current frame, i.e. a window of about 3 s in total.
        LSMS_i_L      = floor(dSampRate * 1.5 / iFP);
        LSMS_aad_LogX = log(max(abs(aad_X), eps));

        for j = 1 : iNumFrames
            iLo = max(j - LSMS_i_L, 1);
            iHi = min(j + LSMS_i_L, iNumFrames);

            % Mean of the log magnitude over the window, for all bins at
            % once. The explicit dimension keeps this correct even when the
            % window contains a single frame.
            adLogMean = mean(LSMS_aad_LogX(:, iLo : iHi), 2);

            LSMS_aad_X_tilde(:, j) = aad_X(:, j) ./ exp(adLogMean);
        end

        aad_X = LSMS_aad_X_tilde;
    end

    if bSPB_R == 1
        SPB_d_alpha = 0.02;     % boosting floor relative to the peak power

        SPB_iFL = iFL;
        SPB_iFP = iFP;

        % The trimming after the overlap-add removes SPB_iFL samples from
        % the head and SPB_iFL + 1 samples from the tail, so anything
        % shorter cannot be processed.
        if iSpeechLen < 2 * SPB_iFL + 1
            error('SPB_R:inputTooShort', ...
                  'Input has %d samples; at least %d are required.', ...
                  iSpeechLen, 2 * SPB_iFL + 1);
        end

        % Number of full frames. The frame LENGTH has to be subtracted
        % here; subtracting the frame period would append all-zero columns
        % to SPB_aad_P and bias the peak-power percentile below.
        SPB_iNumFrames = floor((iSpeechLen - SPB_iFL) / SPB_iFP) + 1;

        SPB_iNumChan = iNumChan;
        SPB_aad_P    = zeros(SPB_iNumChan, SPB_iNumFrames);
        SPB_iFFTSize = 2^ceil(log2(SPB_iFL));

        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        %
        % Obtaining the gammatone coefficients.
        %
        % ComputeFilterResponse and NormalizeFilterGain are defined outside
        % this file; the code below assumes one column per channel with
        % SPB_iFFTSize / 2 rows -- confirm against those helpers.
        %
        SPB_aad_H = ComputeFilterResponse(SPB_iNumChan, SPB_iFFTSize);
        SPB_aad_H = abs(NormalizeFilterGain(SPB_aad_H));
        SPB_aad_X = aad_X;

        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        %
        % Frame-by-frame channel power
        %
        for iFI = 1 : SPB_iNumFrames
            adHalfSpec = abs(aad_X(1 : SPB_iFFTSize / 2, iFI));
            for i = 1 : SPB_iNumChan
                SPB_aad_P(i, iFI) = sum((adHalfSpec .* SPB_aad_H(:, i)) .^ 2);
            end
        end

        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        %
        % Boosting weights: the peak power is the 95th percentile of all
        % channel powers.
        %
        adSorted     = sort(SPB_aad_P(:));
        SPB_d_P_peak = adSorted(round(0.95 * length(adSorted)));
        SPB_aad_w    = sqrt(SPB_aad_P .^ 2 + (SPB_d_alpha * SPB_d_P_peak) .^ 2) ...
                       ./ max(SPB_aad_P, eps);

        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        %
        % Resynthesis using OLA
        %
        iPowerFactor = 1;

        ad_sumH  = sum(SPB_aad_H .^ iPowerFactor, 2);   % per-bin sum over channels
        aad_mu_g = zeros(SPB_iFFTSize / 2, SPB_iNumFrames);
        ad_y     = zeros(1, iSpeechLen);                % row-vector accumulator

        iFI = 0;
        for i = 0 : SPB_iFP : iSpeechLen - 1 - SPB_iFL
            iFI = iFI + 1;

            % Per-bin gain: channel weights spread over the frequency axis
            % by the filter responses, then normalized by their sum.
            for j = 1 : SPB_iNumChan
                aad_mu_g(:, iFI) = aad_mu_g(:, iFI) ...
                    + (SPB_aad_w(j, iFI)) .^ (iPowerFactor / 2) ...
                   .* (SPB_aad_H(:, j)) .^ iPowerFactor;
            end
            aad_mu_g(:, iFI)  = (aad_mu_g(:, iFI) ./ ad_sumH) .^ (1 / iPowerFactor);
            SPB_aad_X(:, iFI) = SPB_aad_X(:, iFI) .* aad_mu_g(:, iFI);

            % Mirror the half spectrum to full FFT length and go back to
            % the time domain; the real part is selected after the loop.
            adBuffer = ifft([SPB_aad_X(:, iFI); flipud(SPB_aad_X(:, iFI))])';
            ad_y(i + 1 : i + SPB_iFL) = ad_y(i + 1 : i + SPB_iFL) + adBuffer(1 : SPB_iFL);
        end

        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        %
        % Dropping the head and the tail of the overlap-add output and
        % selecting only the real part
        %
        ad_y(1 : SPB_iFL) = [];
        iLen = length(ad_y);
        ad_y(iLen - SPB_iFL : iLen) = [];
        ad_y = SPB_iFP * real(ad_y);   % resynthesizing based on the OLA constraint

        ad_x = ad_y';                  % back to a column vector
    end

    if bFeat == 1
        aadFeat = FE_FeatExt(ad_x);
        aadDCT  = aadFeat;             % hand the features back to the caller

        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        %
        % Writing the feature in Sphinx format: an int32 element count
        % followed by the float32 values in column-major order
        %
        [iM, iN] = size(aadFeat);
        iNumData = iM * iN;
        fid      = fopen(szOutFeatFileName, 'wb');
        if fid < 0
            error('SPB_R:cannotOpenOutput', 'Cannot open output file: %s', szOutFeatFileName);
        end
        fwrite(fid, iNumData, 'int32');
        fwrite(fid, aadFeat(:), 'float32');
        fclose(fid);

        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        %
        % Display
        %
        if bDisplay == 1
            figure
            aadSpec = idct(aadFeat, iNumChan);
            imagesc(aadSpec); axis xy;
        end
    else
        aadDCT = [];                   % no features are computed in wave mode

        % wavwrite is no longer available in current MATLAB releases, so
        % audiowrite is preferred when present. NOTE(review): audiowrite
        % needs a file name with a supported extension such as .wav.
        if exist('audiowrite', 'file')
            audiowrite(szOutFeatFileName, ad_x / 32768, 16000, 'BitsPerSample', 16);
        else
            wavwrite(ad_x / 32768, 16000, 16, szOutFeatFileName);
        end
    end

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Feature Extraction Part
%
% Programmed by Chanwoo Kim
%         (chanwook@cs.cmu.edu; chanwcom@gmail.com)
%
% Basically the same as MFCC but using the gammatone weighting.
%
% Pre-emphasis is disabled since it was already done in the SPB stage
%
function [FE_aadDCT] = FE_FeatExt(ad_x)
% FE_FeatExt  Cepstral features from a filter-bank weighted power spectrum.
%
%   FE_aadDCT = FE_FeatExt(ad_x)
%
%   Input
%     ad_x      : speech samples (row or column vector); processed with a
%                 fixed 16 kHz sampling rate
%
%   Output
%     FE_aadDCT : 13 x (number of frames) matrix holding the first 13 DCT
%                 coefficients of the log channel powers, with the mean of
%                 every coefficient over the frames removed
%
%   Pre-emphasis is switched off here because SPB_R applies it before
%   calling this function.

    FE_bPreem = 0;   % pre-emphasis was already done in the SPB-R part

    FE_dFrameLen    = 0.0256;  % 25.6 ms window length, which is the default setting in CMU Sphinx
    FE_dSampRate    = 16000;
    FE_dFramePeriod = 0.010;   % 10 ms frame period

    FE_iFL = floor(FE_dFrameLen    * FE_dSampRate);
    FE_iFP = floor(FE_dFramePeriod * FE_dSampRate);

    % The window below is a column vector, so force the samples into a
    % column as well; a row input would not multiply element by element.
    ad_x = ad_x(:);

    FE_iSpeechLen = length(ad_x);
    FE_iNumFrames = floor((FE_iSpeechLen - FE_iFL) / FE_iFP) + 1;

    FE_iFFTSize = 2^ceil(log2(FE_iFL));
    FE_iNumChan = 40;
    FE_iNumCeps = 13;          % number of cepstral coefficients kept

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Obtaining the gammatone coefficients.
    %
    % ComputeFilterResponse and NormalizeFilterGain are defined outside
    % this file; the code below assumes one column per channel with
    % FE_iFFTSize / 2 rows -- confirm against those helpers.
    %
    FE_aad_H = ComputeFilterResponse(FE_iNumChan, FE_iFFTSize);
    FE_aad_H = abs(NormalizeFilterGain(FE_aad_H));

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Pre-emphasis using H(z) = 1 - 0.97 z ^ -1
    %
    if FE_bPreem == 1
        ad_x = filter([1 -0.97], 1, ad_x);
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Obtaining the short-time power P(i, j)
    %
    FE_aad_P    = zeros(FE_iNumChan, FE_iNumFrames);
    FE_adWindow = hamming(FE_iFL);   % built once, reused for every frame

    FE_i_FI = 0;
    for m = 0 : FE_iFP : FE_iSpeechLen - FE_iFL
        FE_i_FI = FE_i_FI + 1;

        % The windowed magnitude spectrum depends on the frame only, so it
        % is computed once per frame rather than once per channel.
        ad_x_Frame = ad_x(m + 1 : m + FE_iFL) .* FE_adWindow;
        ad_X       = abs(fft(ad_x_Frame, FE_iFFTSize));
        ad_Half_X  = ad_X(1 : FE_iFFTSize / 2);

        % Squared integration for every channel
        for j = 1 : FE_iNumChan
            FE_aad_P(j, FE_i_FI) = sum((ad_Half_X .* FE_aad_H(:, j)) .^ 2);
        end
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Apply the nonlinearity (the floor at eps avoids log(0))
    %
    FE_aadSpec = log(max(FE_aad_P, eps));

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % DCT along the channel axis, keeping the first FE_iNumCeps coefficients
    %
    FE_aadDCT = dct(FE_aadSpec);
    FE_aadDCT(FE_iNumCeps + 1 : FE_iNumChan, :) = [];

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % CMN: remove the mean over the frames from every coefficient
    %
    for i = 1 : FE_iNumCeps
        FE_aadDCT(i, :) = FE_aadDCT(i, :) - mean(FE_aadDCT(i, :));
    end

end
