package cmonson.morphologyChallengeUtilities;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cmonson.util.FileUtils;

/**
 * Converts a Turkish wordlist supplied by Kemal Oflazer into a wordlist that
 * can be used as input to Morfessor.
 * <p>
 * Each non-blank input line holds a surface form followed by whitespace
 * separated numeric columns; the third column is the form's frequency expressed
 * as a percent of all corpus tokens. Each well-formed line is written out as
 * {@code <estimated-count> <surface-form>}, where the count is reconstructed
 * from the percent via {@link #PERCENT_OF_A_FORM_THAT_OCCURED_ONCE}.
 * <p>
 * Typical use: construct, call {@link #convert()}, then {@link #close()}.
 * 
 * @author cmonson
 *
 */
public class Turkish_extractMorfessorStyleInputWordlist {

	/** Source of the input lines; opened by the constructor. */
	BufferedReader turkishWordlistWithFreqInfoReader = null;

	/** Sink for the converted lines; opened by the constructor. */
	PrintWriter turkishWordlistWriter = null;
	
	/*
	 * This is a magic number. Kemal's Turkish files don't list the frequency of each
	 * surface form, but rather:
	 * 
	 *   frequency of surface form
	 *  --------------------------- * 100  ===  the percent of the tokens in the corpus 
	 *    total tokens in corpus                that are a given surface form
	 *    
	 * So we (approximately) reconstruct the frequency information by dividing the 
	 * listed MLE probability estimate by the probability assigned to a surface form
	 * that only occurs once.
	 */
	public static final double PERCENT_OF_A_FORM_THAT_OCCURED_ONCE = 0.00000847732852929;
	
	/** Matches a line that is empty or contains only whitespace. */
	private static final Pattern BLANK_LINE_PATTERN = Pattern.compile("^\\s*$");
	
	/*
	 * Each line is one word followed by a white space separated series of 
	 * numbers representing frequency data. I was not informed what the second
	 * column is. The third column is the word frequency as a percent of the 
	 * total tokens. The fourth column is the cumulative percent.
	 *		
	 * hastadan	0	0.00022888787029077	94.8672488500218
	 * defterleri	0	0.00022888787029077	94.8674777378921
	 * düşüyordu	0	0.00022888787029077	94.8677066257624
	 * heyecanlıyım	0	0.00022888787029077	94.8679355136327
	 * deşarj	0	0.00022888787029077	94.868164401503
	 * köşkte	0	0.00022888787029077	94.8683932893733
	 * 
	 * Group 1 captures the word; group 2 captures the third column.
	 * Compiled once here rather than once per input line.
	 */
	private static final Pattern TURKISH_FREQ_PATTERN = 
		Pattern.compile("^\\s*(\\S+)\\s+\\S+\\s+(\\S+).*$");
	
	/**
	 * Opens the input wordlist for reading and the output wordlist for writing,
	 * both as UTF-8.
	 * 
	 * @param morfessorStyleAnalyses the Turkish wordlist file that also contains
	 *        frequency information (the input, despite the parameter name)
	 * @param processedMorfessorFile the Morfessor-style wordlist file to create
	 *        (the output)
	 */
	public 
	Turkish_extractMorfessorStyleInputWordlist(
			File morfessorStyleAnalyses, 
			File processedMorfessorFile) {
		
		turkishWordlistWithFreqInfoReader = 
			FileUtils.openFileForReading(morfessorStyleAnalyses, "UTF-8"); 
		
		/*
		 * Although Morfessor can't handle UTF-8, Java can't handle latin-5,
		 * so we write things out as UTF-8 and then use emacs to convert to
		 * latin-5 for input to Morfessor.
		 */
		turkishWordlistWriter = 
			FileUtils.openFileForWriting(processedMorfessorFile, "UTF-8"); 
	}
	


	/**
	 * Command line entry point. Converts the wordlist and then closes both files.
	 * 
	 * @param args the command line arguments which must look like:
	 *
	 * &lt;path-to-turkish-wordlist-file-that-also-contains-freq-info&gt; 
	 * &lt;path-to-resulting-MorfessorStyleInput-wordlist-file&gt;
	 * @throws IOException if reading the input or writing the output fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 2) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java Turkish_extractMorfessorStyleInputWordlist " + String.format("%n") +
			 "        <path-to-turkish-wordlist-file-that-also-contains-freq-info> " + String.format("%n") +
			 "        <path-to-resulting-MorfessorStyleInput-wordlist-file>" + String.format("%n%n"));
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			// A usage error is a failure: report it through a non-zero exit status.
			System.exit(1);
		}
		
		Turkish_extractMorfessorStyleInputWordlist converter = 
			new Turkish_extractMorfessorStyleInputWordlist(
					new File(args[0]), 
					new File(args[1]));
		
		try {
			converter.convert();
		} finally {
			// Release both file handles even when the conversion fails.
			converter.close();
		}
	}

	/**
	 * Reads the input wordlist line by line and writes one 
	 * {@code <estimated-count> <surface-form>} line per well-formed input line.
	 * <p>
	 * Blank lines are skipped silently. Lines that do not match the expected
	 * column layout, or whose frequency column is not a number, are reported
	 * on standard error and skipped. A progress line is printed to standard 
	 * error every 1000 non-blank lines. The output is flushed but not closed; 
	 * call {@link #close()} when done.
	 * 
	 * @throws IOException if reading the input fails, or if the output writer
	 *         reports an error
	 */
	public void convert() throws IOException {
		
		int lineCounter = 0;
		String line;
		while ((line = turkishWordlistWithFreqInfoReader.readLine()) != null) {
			
			// skip blank lines
			if (BLANK_LINE_PATTERN.matcher(line).matches()) {
				continue;
			}
			
			lineCounter++;
			if ((lineCounter%1000) == 0) {
				System.err.println("  " + lineCounter + " " + line);
				System.err.flush();
			}
			
			Matcher turkishFreqMatcher = TURKISH_FREQ_PATTERN.matcher(line);
			if ( ! turkishFreqMatcher.matches()) {
				// Querying groups of a failed match throws IllegalStateException,
				// so report the line and move on to the next one.
				reportBadlyFormattedLine(line);
				continue;
			}
			
			String turkishWordform = turkishFreqMatcher.group(1);
			double turkishWordformFreqAsPercent;
			try {
				turkishWordformFreqAsPercent = Double.parseDouble(turkishFreqMatcher.group(2));
			} catch (NumberFormatException e) {
				// The third column is not a number: same treatment as any
				// other malformed line.
				reportBadlyFormattedLine(line);
				continue;
			}
			
			long turkishWordformFreq = 
				Math.round(
					turkishWordformFreqAsPercent / PERCENT_OF_A_FORM_THAT_OCCURED_ONCE);
			
			// Explicit "\n" (not the platform line separator) is intentional.
			turkishWordlistWriter.print(turkishWordformFreq + " " + turkishWordform + "\n");
		}
		turkishWordlistWriter.flush();
		
		// PrintWriter never throws on write failure; it only records the error.
		if (turkishWordlistWriter.checkError()) {
			throw new IOException("Error while writing the Morfessor-style wordlist");
		}
	}
	
	/**
	 * Closes the output writer and the input reader. The reader is closed even
	 * if closing the writer fails.
	 * 
	 * @throws IOException if closing the input reader fails
	 */
	public void close() throws IOException {
		try {
			if (turkishWordlistWriter != null) {
				turkishWordlistWriter.close();
			}
		} finally {
			if (turkishWordlistWithFreqInfoReader != null) {
				turkishWordlistWithFreqInfoReader.close();
			}
		}
	}
	
	/** Prints the badly-formatted-line diagnostic for {@code line} to standard error. */
	private static void reportBadlyFormattedLine(String line) {
		System.err.println();
		System.err.println("BADLY FORMATTED Turkish frequency file");
		System.err.println();
		System.err.println("Line: " + line);
		System.err.println();
	}

}
