%Minimum Error Rate Training For Statistical Machine Translation %Copyright (2005) Ashish Venugopal %This program is free software; you can redistribute it and/or %modify it under the terms of the GNU General Public License %as published by the Free Software Foundation; either version 2 %of the License, or (at your option) any later version. %This program is distributed in the hope that it will be useful, %but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the % GNU General Public License for more details. % You should have received a copy of the GNU General Public % License along with this program; if not, write to the Free % Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA ALLDATA = load(feats_opt); CANDS = load(cands_opt); %init opt is now going to have ranges for each parameter INIT = load(init_opt); disp('Loaded translation data'); startEndIndices = zeros(size(CANDS,1), 2); startIndex=1; count = 0; for s=1:1:size(CANDS,1) endIndex = startIndex + CANDS(s, 2)-1; count = count + 1; startEndIndices(count, :) = [startIndex, endIndex]; startIndex=endIndex+1; end [nlines, NF] = size(INIT); ALLFEAT = ALLDATA(:, 1:NF); ALLSCORES = ALLDATA(:, NF+1:size(ALLDATA,2)); %clear ALLDATA; %these variables are read in from the parameter restriction file leftBound = INIT(1, :) rightBound = INIT(2, :) startingParams = INIT(3, :); %when a lambda is within ExpansionMargin percentage ratio %of either (symetric) bound, then multiply the bound by %the ExpansionFactor ExpansionMargin = 0.0; ExpansionFactor = 1.2; %number of times lambdas are randomly initialized for optimization NumRandomTests = 5; PermutationsEpsilon = 0.001; %nist stats ratio_x = 1.5; score_x = 0.5; beta = -log(score_x)/log(ratio_x)/log(ratio_x); MAXNIST = 20.0; %generate initial sums initialScore = zeros(1, size(ALLSCORES,2)); numSentences = size(startEndIndices,1); for s=1:numSentences sentStart = startEndIndices(s, 1); initialScore = initialScore+ALLSCORES(sentStart, :);; end numgrams = (size(initialScore,2)-1)/2.0; referenceLength = initialScore(1, length(initialScore)); ngramData = initialScore(1, 1:length(initialScore)-1); ngramData = reshape(ngramData, 2, numgrams); basePrecision = ngramData(1,:)./ngramData(2,:); %ngramData(2,1) is the number of unigrams suggested lratio = ngramData(2,1)/referenceLength; lpen = 1.0; if lratio>=1.0 %do nothing else lpen = exp (-beta*log(lratio)*log(lratio)); end bleuScore = (lpen*sum(basePrecision)); newError = MAXNIST-bleuScore; str = 'InitialScore:'; str = sprintf('%s %f\n',str,bleuScore); disp(str); %saving the full optimization data after each complete iteration %FullResults = zeros(NumRandomTests, NF+1); FullResults = []; %top level randomization of start values for (topLevel=1:NumRandomTests) %generate random values in the restricted ranges, but dont %initialize the sampled values right next to the bounds, since then %the Expansion will always kick in. This way if the optimization %finds better parameters in the margin, only then will be have to %expand the range % if there are three rows in INIT and this is the first % iteration, then use the value from the last full run if (topLevel==1 && nlines==3) LAMBDAS = startingParams; else LAMBDAS = (leftBound+ExpansionMargin) + ((rightBound-ExpansionMargin)-(leftBound+ExpansionMargin)).*rand(1, NF); end str = 'Start Values:'; disp(str); str = ''; for i=1:length(LAMBDAS) str = sprintf('%s %f',str,LAMBDAS(i)); end disp(str); %lastValue hold initial degenerate values of lambda lastValue = ones(1,size(LAMBDAS,2))*-1.0; %holds the degerate value for the error savedError = MAXNIST; lastError = MAXNIST; numIter = 1; ConvergedLimit = 3; converged = 0; IterationLimit = 100; %generate a random permutation of the dimension indices to vary %the search order, acrosss random initializations %ARV 06/20 %oRange = randperm(NF); oRange = 1:NF; while (numIter= slopeOfRightCandidate); %initialize data that must be saved for this iteration %numIntersects starts at 1 since the first intersection point=leftBound numIntersects = 1; intersectPoints = zeros(numCands+1, 2); intersectPoints(1, :) = [leftBound(optiDim) minLeftValue]; %initialize the errorDelta that comes from crosssing the leftBound errorDeltas = zeros(numCands+1, size(SCORES,2)+1); errorDeltas(1, :) = [intersectPoints(1, 1) SCORES(minLeftCandidate, :)]; %used to create error deltas across intersection boundaries workingError = SCORES(minLeftCandidate, :); %the relevant indices determine which candidate to consider for %evaluation. currently those sentences which have a slope lower %than the current sentence are chosen, becuase these %sentences have a chance of intersecting the current %sentence and becoming the lowest cost translation while (length(relevantIndices)>0) bestL = rightBound(optiDim); bestValue = minRightValue; bestSlope = slopeOfLeftCandidate; %to save the new index of the min candidate newMinCandidate = minLeftCandidate; for c=1:length(relevantIndices) x = relevantIndices(c); top = a(x,1)-a(minLeftCandidate,1); bottom = slopes(minLeftCandidate,1)-slopes(x,1); if (bottom~=0) newL = top/bottom; if (newL < bestL) candidateSlope = slopes(x); bestL = newL; bestValue = a(x)+ (newL*candidateSlope); newMinCandidate = c; bestSlope = candidateSlope; end %allows new intersections to be proposed that intersect at the %same lambda. In this case consider only the one with the %steeper slope since it will be lower in the next region if (newL == bestL & (slopes(x)= slopeOfRightCandidate); end %ends the search for all relevant indices within this sentence %prunes the lists down to their actual lengths intersectPoints = intersectPoints(1:numIntersects, :); errorDeltas = errorDeltas(1:numIntersects, :); totalErrorDelta = [totalErrorDelta; errorDeltas]; end %end of search thru each sentence totalErrorDelta = sortrows(totalErrorDelta); %generate bleu scores with length penalty for these boundaries lenTotalErrorDelta = length(totalErrorDelta); errorLineLen = size(totalErrorDelta, 2); numgrams = (size(ALLSCORES,2)-1)/2.0; % a 0 count line vector to compare against zeroLine = zeros(1, errorLineLen-1); totalErrorDelta = [totalErrorDelta; [rightBound(optiDim), zeroLine]]; totalBaseError = zeroLine; %set the lowest error to the max possible error of this metric lowestError = MAXNIST; lowestLambda = leftBound(optiDim); for bIndex=1:lenTotalErrorDelta %increment the base count data incrementalError = totalErrorDelta(bIndex,2:errorLineLen); %no need to recalculate the error if there is no change in error count if (any(incrementalError)) totalBaseError = totalBaseError + incrementalError; %if there are several boundary values that are the same we want to %only consider a difference in score once they have all been %accounted for if (totalErrorDelta(bIndex, 1)~=totalErrorDelta(bIndex+1, 1)) %calculate the new bleu score referenceLength = totalBaseError(1, length(totalBaseError)); ngramData = totalBaseError(1, 1:length(totalBaseError)-1); ngramData = reshape(ngramData, 2, numgrams); basePrecision = ngramData(1,:)./ngramData(2,:); %ngramData(2,1) is the number of unigrams suggested lratio = ngramData(2,1)/referenceLength; lpen = 1.0; if lratio>=1.0 %do nothing else lpen = exp (-beta*log(lratio)*log(lratio)); end newError = MAXNIST-(lpen*sum(basePrecision)); %see if an improvement has been made if (newError < lowestError) lowestError = newError; lowestLambda = (totalErrorDelta(bIndex, 1)+totalErrorDelta(bIndex+1, 1))/2.0; end %use this to plot an error surface if you want %errorSurface = [errorSurface; [totalErrorDelta(bIndex, 1) totalErrorDelta(bIndex+1, 1) ((totalErrorDelta(bIndex, 1)+totalErrorDelta(bIndex+1, 1))/2.0) newError]]; end end end %replace lambda with the old value LAMBDAS(1, optiDim) = savedLambda; %save the error values in the errorGain table errorGain(1, optiDim) = lowestLambda; errorGain(2, optiDim) = lowestError; %allows you to jump out and do temporarily worse for a bit savedError=lowestError; %dont lock in the error anymore %LAMBDAS(1,optiDim)=lowestLambda; %null out a parameter if it doesnt changes at all %for i=1:length(LAMBDAS) % if var(ALLFEAT(:,i))==0 % LAMBDAS(i)=0; % end % end %display the lowest error if we had used the value of lambda %str = ''; %str = sprintf('Dim %f', optiDim); %LAMBDAS_COPY = LAMBDAS; %LAMBDAS_COPY(1, optiDim) = lowestLambda; %for i=1:length(LAMBDAS_COPY) % str = sprintf('%s %f',str,LAMBDAS_COPY(i)); %end %str = sprintf('%s %f', str, lowestError); %disp(str); %at this point we have gone thru all dimensions end %disp('Iteration Completed, picking optimal lambda by gain'); %errorGain [minError, minErrorDim] = min(errorGain(2, :)); %minError %minErrorDim if (minError <= savedError) LAMBDAS(1, minErrorDim) = errorGain(1, minErrorDim); str = ''; str = sprintf('UsedDim(%f)', minErrorDim); for i=1:length(LAMBDAS) if (i==minErrorDim) str = sprintf('%s <%f>',str,LAMBDAS(i)); else str = sprintf('%s %f',str,LAMBDAS(i)); end end str = sprintf('%s %f', str, minError); disp(str); savedError = minError; end % a mini convergence has happenned, lets save its value % in case the kickout cannot return back to the same value % this kickout move can only be done a limited number of % times, ie the converged limit if (abs(savedError-lastError)