package cmonson.morphologyChallengeUtilities;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;

import cmonson.util.FileUtils;

/**
 * This class does simple processing on a file of Turkish text. Standard Turkish
 * orthography uses several letters that fall outside ASCII. Since Morfessor
 * (Creutz, 2006) was written in Perl and Makefiles, which can't handle UTF-8,
 * Morpho Challenge 2007 invented an ASCII transliteration of Turkish orthography.
 * This class converts to and from this invented ASCII transliteration and the
 * UTF-8 encoding of the Unicode for the standard Turkish orthography.
 * 
 * Although the Morpho Challenge 2007 orthography *should* be all ASCII, foreign
 * words could possibly slip in with characters in some 8-bit encoding (most
 * likely latin-1). But since you can never be sure, this script assumes that
 * all input text is in UTF-8.
 * 
 * @author cmonson
 *
 */
public class Turkish_convertBetweenStandardAndMorphoChallenge2007Orthographies {

	/**
	 * The six standard-orthography letters that have a Morpho Challenge 2007
	 * replacement: c-cedilla (U+00E7), g-breve (U+011F), dotless i (U+0131),
	 * o-diaeresis (U+00F6), s-cedilla (U+015F) and u-diaeresis (U+00FC).
	 * They are spelled as Unicode escapes so that this class compiles to the
	 * same characters whatever source encoding the compiler assumes.
	 */
	private static final char[] STANDARD_LETTERS = {
		'\u00e7', '\u011f', '\u0131', '\u00f6', '\u015f', '\u00fc'
	};

	/**
	 * The Morpho Challenge 2007 ASCII stand-ins, index-aligned with
	 * {@link #STANDARD_LETTERS}.
	 */
	private static final char[] MORPHO_CHALLENGE_2007_LETTERS = {
		'C', 'G', 'I', 'O', 'S', 'U'
	};

	private BufferedReader turkishSourceReader = null;
	private PrintWriter turkishTargetWriter = null;
	
	/** The orthography of the text being read; selects the conversion direction. */
	TurkishOrthography inputOrthography = null;
	
	private enum TurkishOrthography {
		STANDARD,
		MORPHO_CHALLENGE_2007
	}
	
	/**
	 * Opens the input and output files through {@code FileUtils}, passing
	 * "UTF-8" as the encoding name for both.
	 *
	 * @param inputOrthography the orthography of the text in turkishInputFile
	 * @param turkishInputFile the file to read Turkish text from
	 * @param turkishOutputFile the file to write the converted text to
	 * @throws IllegalArgumentException if inputOrthography is null
	 */
	public 
	Turkish_convertBetweenStandardAndMorphoChallenge2007Orthographies(
			TurkishOrthography inputOrthography,
			File turkishInputFile, 
			File turkishOutputFile) {
		
		// Fail before touching the file system: a null orthography could only
		// surface later as a NullPointerException in the middle of convert().
		if (inputOrthography == null) {
			throw new IllegalArgumentException("inputOrthography must not be null");
		}
		
		this.inputOrthography = inputOrthography;
		
		turkishSourceReader = FileUtils.openFileForReading(turkishInputFile, "UTF-8"); 
		
		turkishTargetWriter = FileUtils.openFileForWriting(turkishOutputFile, "UTF-8"); 
	}

	/**
	 * Command line entry point. Prints a usage message and exits if the 
	 * arguments are malformed.
	 *
	 * @param args the command line arguments which must look like:
	 *
	 * -inputOrthography [ MorphoChallenge2007 | Standard ] 
	 * &lt;path-to-inputFile&gt; &lt;path-to-output-file&gt;
	 * 
	 * The flag and the orthography name are matched case-insensitively.
	 * @throws IOException if reading the input or writing the output fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 4) {
			System.err.println();
			System.err.println("There must be exactly 4 command line arguments");
			System.err.println("  This time there were " + args.length);
			System.err.println();
			usage();
		}
		
		// equalsIgnoreCase() is locale independent. String.toLowerCase() is not:
		// under a Turkish default locale 'I' lowercases to dotless i, so 
		// "-inputOrthography" would never match "-inputorthography".
		if ( ! "-inputorthography".equalsIgnoreCase(args[0])) {
			System.err.println();
			System.err.println("The first command line argument must be: -inputorthography");
			System.err.println("  This time it was: " + args[0]);
			System.err.println();
			usage();
		}
		
		TurkishOrthography inputOrthography = null;
		if ("morphochallenge2007".equalsIgnoreCase(args[1])) {
			inputOrthography = TurkishOrthography.MORPHO_CHALLENGE_2007;
		} else if ("standard".equalsIgnoreCase(args[1])) {
			inputOrthography = TurkishOrthography.STANDARD;
		} else {
			System.err.println();
			System.err.println("The input orthography must be specified as:");
			System.err.println("  morphochallenge2007 OR standard");
			System.err.println();
			usage();
		}
		
		Turkish_convertBetweenStandardAndMorphoChallenge2007Orthographies converter = 
			new Turkish_convertBetweenStandardAndMorphoChallenge2007Orthographies(
					inputOrthography,
					new File(args[2]), 
					new File(args[3]));
		
		converter.convert();
	}

	/**
	 * Prints the expected command line to standard error and terminates the
	 * JVM. This is only called for malformed command lines, so the exit status
	 * is non-zero to let calling scripts detect the failure.
	 */
	private static void usage() {
		System.err.println("The command line must look like:");
		System.err.println(
		 "    java Turkish_convertBetweenStandardAndMorphoChallenge2007Orthographies " + String.format("%n") +
		 "        -inputOrthography [ MorphoChallenge2007 | Standard ]" + String.format("%n") +
		 "        <path-to-inputFile> " + String.format("%n") +
		 "        <path-to-output-file>" + String.format("%n%n"));
		System.err.println("    Exiting...");
		System.err.println();
		System.err.println();
		System.exit(1);
	}

	/**
	 * Reads the input file line by line, converts each line out of the input
	 * orthography, and writes the result to the output file. Every 1000th
	 * input line is echoed to standard error as a progress indicator.
	 * 
	 * Both files are closed when this method returns or throws, so a converter
	 * instance can only be used for a single conversion.
	 * 
	 * @throws IOException if reading the input fails or if the output writer
	 *                     reports an error
	 */
	public void convert() throws IOException {
		
		try {
			int lineCounter = 0;
			String lineFromTurkishInputFile;
			while ((lineFromTurkishInputFile = 
					turkishSourceReader.readLine()) != null) {
				
				lineCounter++;
				if ((lineCounter%1000) == 0) {
					System.err.println("  " + lineCounter + " " + lineFromTurkishInputFile);
					System.err.flush();
				}
				
				turkishTargetWriter.println(convertLine(lineFromTurkishInputFile));
			}
			
			// PrintWriter swallows write failures; checkError() flushes the 
			// stream and reports whether any write went wrong.
			if (turkishTargetWriter.checkError()) {
				throw new IOException(
						"An error occurred while writing the converted Turkish text");
			}
			
		} finally {
			// Release both file handles even if the conversion failed part way.
			try {
				if (turkishTargetWriter != null) {
					turkishTargetWriter.close();
				}
			} finally {
				if (turkishSourceReader != null) {
					turkishSourceReader.close();
				}
			}
		}
	}

	/**
	 * Converts one line out of the input orthography into the other one.
	 * 
	 * @param lineFromTurkishInputFile a line written in the input orthography
	 * @return the same line written in the other orthography
	 * @throws IllegalStateException if the input orthography is not one this
	 *                               class knows how to convert
	 */
	private String convertLine(String lineFromTurkishInputFile) {
		switch (inputOrthography) {
		case MORPHO_CHALLENGE_2007:
			return fromMorphoChallenge2007_to_Standard_orthography(lineFromTurkishInputFile);
		case STANDARD:
			return fromStandard_to_MorphoChallenge2007_orthography(lineFromTurkishInputFile);
		default:
			// Stop rather than write a bogus line to the output file.
			throw new IllegalStateException(
					"Unsupported input orthography: " + inputOrthography);
		}
	}

	/**
	 * Replaces each upper case C, G, I, O, S and U with the standard Turkish
	 * letter it stands for in the Morpho Challenge 2007 transliteration. All
	 * other characters are left untouched.
	 * 
	 * @param morphoChallenge2007OrthographyString text in the ASCII transliteration
	 * @return the text in standard Turkish orthography
	 */
	private String 
	fromMorphoChallenge2007_to_Standard_orthography(
			String morphoChallenge2007OrthographyString) {
		
		return replaceLetters(
				morphoChallenge2007OrthographyString,
				MORPHO_CHALLENGE_2007_LETTERS,
				STANDARD_LETTERS);
	}

	/**
	 * Replaces each of the six lower case standard Turkish letters listed in
	 * {@link #STANDARD_LETTERS} with its upper case ASCII stand-in. All other
	 * characters are left untouched.
	 * 
	 * @param standardOrthographyString text in standard Turkish orthography
	 * @return the text in the Morpho Challenge 2007 transliteration
	 */
	private String 
	fromStandard_to_MorphoChallenge2007_orthography(String standardOrthographyString) {

		return replaceLetters(
				standardOrthographyString,
				STANDARD_LETTERS,
				MORPHO_CHALLENGE_2007_LETTERS);
	}

	/**
	 * Replaces every occurrence of from[i] in text with to[i]. The two letter
	 * sets used by this class are disjoint, so the order of the replacements
	 * does not affect the result.
	 */
	private static String replaceLetters(String text, char[] from, char[] to) {
		String converted = text;
		for (int i = 0; i < from.length; i++) {
			converted = converted.replace(from[i], to[i]);
		}
		return converted;
	}
}
