package cmonson.morphologyChallengeUtilities;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cmonson.util.FileUtils;

/**
 * This class does simple processing on a Turkish wordlist supplied by Kemal Oflazer
 * to remove frequency information and obtain a vocabulary list. I needed to write
 * this class instead of just using ParaMor's built-in vocabulary builder because
 * I do not yet have a definitive list of all the characters in Turkish. The wordlist
 * Kemal gave me is very clean, and once I have run this class, the list of Turkish
 * characters is simply all the characters in any word in the wordlist.
 * 
 * @author cmonson
 *
 */
public class Turkish_extractBareWordlist {

	/** Matches a line that is empty or consists only of white space. */
	private static final Pattern BLANK_LINE_PATTERN = Pattern.compile("^\\s*$");

	/**
	 * Captures, as group 1, the first run of non-white-space characters on a line.
	 * Everything after that first token (the frequency columns) is ignored.
	 */
	private static final Pattern TURKISH_FREQ_PATTERN = Pattern.compile("^\\s*(\\S+).*$");

	/** Source of the input lines: one word followed by frequency columns. */
	BufferedReader turkishWordlistWithFreqInfoReader = null;

	/** Destination for the bare word list: one word per line. */
	PrintWriter turkishWordlistWriter = null;
	
	/**
	 * Opens the input and output files through {@link FileUtils}, passing "UTF-8"
	 * as the encoding for both.
	 * <p>
	 * The parameter names are historical; this class does not read or write
	 * Morfessor files.
	 * 
	 * @param morfessorStyleAnalyses the Turkish word list file that also contains
	 *                               frequency information (the input)
	 * @param processedMorfessorFile the file the bare word list is written to
	 *                               (the output)
	 */
	public 
	Turkish_extractBareWordlist(
			File morfessorStyleAnalyses, 
			File processedMorfessorFile) {
		
		turkishWordlistWithFreqInfoReader = 
			FileUtils.openFileForReading(morfessorStyleAnalyses, "UTF-8"); 
		
		turkishWordlistWriter = 
			FileUtils.openFileForWriting(processedMorfessorFile, "UTF-8"); 
	}
	


	/**
	 * Command line entry point. Converts the input word list and always closes
	 * both files afterwards, even when the conversion fails.
	 * 
	 * @param args the command line arguments which must look like:
	 *
	 * &lt;path-to-turkish-wordlist-file-that-also-contains-freq-info&gt;
	 * &lt;path-to-processed-wordlist-file-output&gt;
	 * @throws IOException if reading the input, writing the output, or closing
	 *                     either file fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 2) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java Turkish_extractBareWordlist " + String.format("%n") +
			 "        <path-to-turkish-wordlist-file-that-also-contains-freq-info> " + String.format("%n") +
			 "        <path-to-processed-wordlist-file-output>" + String.format("%n%n"));
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			System.exit(0);
		}
		
		Turkish_extractBareWordlist converter = 
			new Turkish_extractBareWordlist(
					new File(args[0]), 
					new File(args[1]));
		
		try {
			converter.convert();
		} finally {
			converter.close();
		}
	}

	/**
	 * Reads the input line by line and writes the first white-space-delimited
	 * token of every non-blank line to the output, one token per line.
	 * <p>
	 * Blank lines are skipped silently. A progress message is printed to
	 * standard error for every 1000th non-blank line. A non-blank line that does
	 * not match the expected format is reported on standard error and skipped.
	 * <p>
	 * This method does not close the files; call {@link #close()} when done.
	 * 
	 * @throws IOException if reading the input fails or the writer reports an
	 *                     error while writing the output
	 */
	public void convert() throws IOException {
		
		int lineCounter = 0;
		String lineFromTurkishWordlistFileContainingFreqInfo;
		while ((lineFromTurkishWordlistFileContainingFreqInfo = 
					turkishWordlistWithFreqInfoReader.readLine()) != null) {
			
			// skip blank lines
			if (BLANK_LINE_PATTERN.matcher(lineFromTurkishWordlistFileContainingFreqInfo).matches()) {
				continue;
			}
			
			lineCounter++;
			if ((lineCounter%1000) == 0) {
				System.err.println("  " + lineCounter + " " + lineFromTurkishWordlistFileContainingFreqInfo);
				System.err.flush();
			}
			
			// Each line is one word followed by a white space separated series of 
			// numbers representing frequency data.
			//		
			// hastadan	0	0.00022888787029077	94.8672488500218
			// defterleri	0	0.00022888787029077	94.8674777378921
			// düşüyordu	0	0.00022888787029077	94.8677066257624
			// heyecanlıyım	0	0.00022888787029077	94.8679355136327
			// deşarj	0	0.00022888787029077	94.868164401503
			// köşkte	0	0.00022888787029077	94.8683932893733

			Matcher turkishFreqMatcher = 
				TURKISH_FREQ_PATTERN.matcher(lineFromTurkishWordlistFileContainingFreqInfo);
			if ( ! turkishFreqMatcher.matches()) {
				System.err.println();
				System.err.println("BADLY FORMATTED Turkish frequency file");
				System.err.println();
				System.err.println("Line: " + lineFromTurkishWordlistFileContainingFreqInfo);
				System.err.println();
				
				// A match can fail when the line contains a character such as
				// U+2028 that '.' refuses to match but readLine() does not split
				// on. Calling group(1) after a failed match throws
				// IllegalStateException, so skip the line instead.
				continue;
			}
			String turkishWordform = turkishFreqMatcher.group(1);
			
			turkishWordlistWriter.print(turkishWordform + "\n");
		}
		
		// PrintWriter never throws on a failed write; checkError() flushes the
		// stream and reports whether any write has failed so far.
		if (turkishWordlistWriter.checkError()) {
			throw new IOException("Failed while writing the processed Turkish wordlist");
		}
	}

	/**
	 * Closes the output writer and the input reader. The reader is closed even
	 * if closing the writer throws. Fields that are null are ignored.
	 * 
	 * @throws IOException if closing the input reader fails
	 */
	public void close() throws IOException {
		try {
			if (turkishWordlistWriter != null) {
				turkishWordlistWriter.close();
			}
		} finally {
			if (turkishWordlistWithFreqInfoReader != null) {
				turkishWordlistWithFreqInfoReader.close();
			}
		}
	}

}
