package cmonson.morphologyChallengeUtilities;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cmonson.util.FileUtils;

/**
 * This class does simple processing on a Morphologically analyzed Turkish wordlist 
 * supplied by Kemal Oflazer to convert the analyses into a MorphoChallenge 2007 style
 * answer key.
 * 
 * @author cmonson
 *
 */
public class Turkish_kemalsMorphologicallyAnalysesToMorphoChallenge2007Style {

	/** Matches a line that contains nothing but (optional) whitespace. */
	private static final Pattern BLANK_LINE_PATTERN = Pattern.compile("^\\s*$");

	/**
	 * Matches one input line: a wordform (group 1), at least one whitespace
	 * character, and then the rest of the line as the analysis (group 2).
	 */
	private static final Pattern KEMAL_STYLE_ANALYSIS_PATTERN = 
		Pattern.compile("^\\s*(\\S+)\\s+(.*)$");

	/** Matches optional whitespace followed by the '+' that begins a feature. */
	private static final Pattern FEATURE_START_PATTERN = Pattern.compile("\\s*\\+");

	BufferedReader turkishAanalysesFromKemalReader = null;
	PrintWriter morphoChallenge2007AnswerKeyWriter = null;
	
	/**
	 * Opens the input file for reading and the output file for writing, both
	 * as UTF-8, via {@code FileUtils}.
	 * 
	 * @param turkishAnalysesFromKemalFile the analyzed Turkish wordlist to read
	 * @param morphoChallenge2007AnswerKeyFile the answer key file to write
	 */
	public 
	Turkish_kemalsMorphologicallyAnalysesToMorphoChallenge2007Style(
			File turkishAnalysesFromKemalFile, 
			File morphoChallenge2007AnswerKeyFile) {
		
		turkishAanalysesFromKemalReader = 
			FileUtils.openFileForReading(turkishAnalysesFromKemalFile, "UTF-8"); 
		
		morphoChallenge2007AnswerKeyWriter = 
			FileUtils.openFileForWriting(morphoChallenge2007AnswerKeyFile, "UTF-8"); 
	}

	/**
	 * Converts the input file to the output file and then closes both.
	 * 
	 * @param args the command line arguments which must look like:
	 *
	 * &lt;path-to-kemal's-morphologically-analyzed-turkish-words&gt; 
	 * &lt;path-to-morpho-challenge-2007-style-output-file&gt;
	 * @throws IOException if reading the input or writing the output fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 2) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java Turkish_kemalsMorphologicallyAnalysesToMorphoChallenge2007Style " + String.format("%n") +
			 "        <path-to-kemal's-morphologically-analyzed-turkish-words> " + String.format("%n") +
			 "        <path-to-morpho-challenge-2007-style-output-file>" + String.format("%n%n"));
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			System.exit(0);
		}
		
		Turkish_kemalsMorphologicallyAnalysesToMorphoChallenge2007Style converter = 
			new Turkish_kemalsMorphologicallyAnalysesToMorphoChallenge2007Style(
					new File(args[0]), 
					new File(args[1]));
		
		// Always release the file handles, even if the conversion fails.
		try {
			converter.convert();
		} finally {
			converter.close();
		}
	}

	/**
	 * Reads every line of the input and writes the MorphoChallenge 2007 style
	 * answer key.
	 * <p>
	 * Each non-blank input line is a wordform followed by a single analysis. 
	 * If a wordform has more than one analysis, then there is more than one 
	 * (consecutive) line for that wordform:
	 * <pre>
	 * ve	ve
	 *
	 * için	iç	+Imp+A2pl
	 * için	iç	+P2sg
	 * için	iç	+Gen
	 * için	için
	 *
	 * de	de
	 * </pre>
	 * The Morpho Challenge 2007 style on the other hand has a single line for
	 * each wordform, with multiple analyses comma separated. Additionally, 
	 * sequences of multiple features are space separated, 
	 * e.g. {@code +Imp+A2pl --> +Imp +A2pl}:
	 * <pre>
	 * ve	ve
	 * için	iç +Imp +A2pl, iç +P2sg, iç +Gen, için
	 * de	de
	 * </pre>
	 * In the output the wordform is separated from its analyses by a TAB 
	 * character, and every entry (including the last) is terminated by a 
	 * newline. Lines that do not consist of a wordform followed by an analysis
	 * are reported on standard error and skipped. Progress is reported on 
	 * standard error every 1000 non-blank lines.
	 * <p>
	 * This method flushes the output but does not close the files; 
	 * see {@link #close()}.
	 * 
	 * @throws IOException if reading the input fails or the writer reports
	 *                     an error
	 */
	public void convert() throws IOException {
		
		// null until the first wordform has been written
		String currentTurkishWordForm = null;
		
		int lineCounter = 0;
		String lineFromTurkishAnalysesFromKemal;
		while ((lineFromTurkishAnalysesFromKemal = 
					turkishAanalysesFromKemalReader.readLine()) != null) {
			
			// skip blank lines
			if (BLANK_LINE_PATTERN.matcher(lineFromTurkishAnalysesFromKemal).matches()) {
				continue;
			}
			
			lineCounter++;
			if ((lineCounter%1000) == 0) {
				System.err.println("  " + lineCounter + " " + lineFromTurkishAnalysesFromKemal);
				System.err.flush();
			}
			
			Matcher kemalStyleAnalysisMatcher = 
				KEMAL_STYLE_ANALYSIS_PATTERN.matcher(lineFromTurkishAnalysesFromKemal);
			if ( ! kemalStyleAnalysisMatcher.matches()) {
				System.err.println();
				System.err.println("BADLY FORMATTED Kemal Style Analysis File");
				System.err.println();
				System.err.println("Line: " + lineFromTurkishAnalysesFromKemal);
				System.err.println();
				// Asking a failed matcher for its groups throws an
				// IllegalStateException, so skip the unusable line.
				continue;
			}
			String newTurkishWordform = kemalStyleAnalysisMatcher.group(1);
			String analysis = kemalStyleAnalysisMatcher.group(2);
			
			if (newTurkishWordform.equals(currentTurkishWordForm)) {
				// Another analysis of the wordform we are already writing
				morphoChallenge2007AnswerKeyWriter.print(", ");
			} else {
				// A new wordform: terminate the previous entry, if any, so
				// that the output does not begin with an empty line.
				if (currentTurkishWordForm != null) {
					morphoChallenge2007AnswerKeyWriter.print("\n");
				}
				currentTurkishWordForm = newTurkishWordform;
				morphoChallenge2007AnswerKeyWriter.print(newTurkishWordform + "\t");
			}
			
			// Replace each instance of optional whitespace followed by a '+'
			// with a space followed by a '+' to conform to the 
			// Morpho Challenge 2007 style.
			analysis = FEATURE_START_PATTERN.matcher(analysis).replaceAll(" +");
			
			morphoChallenge2007AnswerKeyWriter.print(analysis);
		}
		
		// Terminate the final entry
		if (currentTurkishWordForm != null) {
			morphoChallenge2007AnswerKeyWriter.print("\n");
		}
		
		// PrintWriter never throws on write failures; checkError() flushes
		// the stream and reports whether any write has failed.
		if (morphoChallenge2007AnswerKeyWriter.checkError()) {
			throw new IOException(
					"An error occurred while writing the Morpho Challenge 2007 answer key");
		}
	}

	/**
	 * Closes the input reader and the output writer. The writer is closed 
	 * even if closing the reader fails.
	 * 
	 * @throws IOException if closing the reader fails
	 */
	public void close() throws IOException {
		try {
			if (turkishAanalysesFromKemalReader != null) {
				turkishAanalysesFromKemalReader.close();
			}
		} finally {
			if (morphoChallenge2007AnswerKeyWriter != null) {
				morphoChallenge2007AnswerKeyWriter.close();
			}
		}
	}

}
