/**
 * The AVENUE Project
 * Language Technologies Institute
 * School of Computer Science
 * (c) 2007 Carnegie Mellon University
 * 
 * Corpus Navigator
 * Written by Jonathan Clark
 */
package edu.cmu.cs.lti.avenue.navigation.tools;

import info.jonclark.util.ArrayUtils;
import info.jonclark.util.HashUtils;
import info.jonclark.util.StringUtils;

import java.io.File;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.TreeMap;
import java.util.Map.Entry;

import edu.cmu.cs.lti.avenue.corpus.Corpus;
import edu.cmu.cs.lti.avenue.corpus.SentencePair;
import edu.cmu.cs.lti.avenue.corpus.Serializer;

/**
 * Command-line tool that walks a transliterated corpus and a script corpus in
 * parallel, sanity-checks each pair of sentences against the other, and saves
 * the transliterated corpus (annotated with, or replaced by, the script target
 * sentence) to an output file.
 * <p>
 * While walking the corpora it builds a lexicon mapping each transliterated
 * token to the script tokens seen at the same position, and prints that
 * lexicon to standard output when done.
 * <p>
 * Usage:
 * {@code program <translit_file> <script_file> <both_out_file> [--replace-target]
 * [--ignore-errors] [--transliteration-in-context] [--script-one-per-line]}
 */
public class TransliteratedAndScriptVersionAligner {

	/** Encoding used to load both corpora and to emit diagnostics. */
	private static final String ENCODING = "UTF-8";

	/** Three positional file arguments are required. */
	private static final int MIN_ARGS = 3;

	/** Three positional arguments plus the four optional flags. */
	private static final int MAX_ARGS = 7;

	/**
	 * Program entry point; see the class comment for the argument list.
	 * 
	 * @param args
	 *            the three file names followed by any of the optional flags
	 * @throws RuntimeException
	 *             if a mismatch between the two corpora is detected and
	 *             <code>--ignore-errors</code> was not given
	 * @throws Exception
	 *             if loading or saving a corpus fails
	 */
	public static void main(String[] args) throws Exception {

		if (args.length < MIN_ARGS || args.length > MAX_ARGS) {
			System.err.println("Usage: program <translit_file> <script_file> <both_out_file>"
					+ " [--replace-target] [--ignore-errors] [--transliteration-in-context] [--script-one-per-line]");
			System.exit(1);
		}

		boolean replaceTarget = ArrayUtils.unsortedArrayContains(args, "--replace-target");
		boolean ignoreErrors = ArrayUtils.unsortedArrayContains(args, "--ignore-errors");
		boolean translitInContext =
				ArrayUtils.unsortedArrayContains(args, "--transliteration-in-context");
		// NOTE(review): this flag is parsed but nothing in this class reads it.
		boolean scriptOnePerLine = ArrayUtils.unsortedArrayContains(args, "--script-one-per-line");

		// Diagnostics contain non-ASCII text; always emit them as UTF-8 bytes
		// instead of relying on the platform default charset. The wrappers
		// auto-flush and are intentionally never closed, since closing them
		// would close the underlying standard streams.
		PrintStream out = utf8(System.out);
		PrintStream err = utf8(System.err);

		// transliterated token -> every script token seen at the same position
		TreeMap<String, HashSet<String>> transliterationLexicon =
				new TreeMap<String, HashSet<String>>();

		Corpus transliteratedPairs =
				Serializer.loadSentencePairs(new File(args[0]), true, ENCODING);
		Corpus scriptPairs = Serializer.loadSentencePairs(new File(args[1]), ENCODING);
		File outFile = new File(args[2]);

		int nErrors = 0;

		// An explicit check rather than an assert: assertions are disabled
		// unless the JVM is started with -ea, and a silent mismatch would
		// either run off the end of the shorter corpus or skip pairs.
		int nTransliterated = transliteratedPairs.getSentences().size();
		int nScript = scriptPairs.getSentences().size();
		if (nTransliterated != nScript) {
			warnOrFail("Mismatched number of transliterated and script pairs: "
					+ nTransliterated + " transliterated vs " + nScript + " script",
					ignoreErrors, err);
			nErrors++;
		}
		// When errors are ignored, only the pairs present in both corpora can
		// be compared.
		int nPairs = Math.min(nTransliterated, nScript);

		for (int i = 0; i < nPairs; i++) {
			SentencePair trans = transliteratedPairs.getSentences().get(i);
			SentencePair script = scriptPairs.getSentences().get(i);

			String[] transTokens = trans.getDisplayTargetTokens();
			String[] scriptTokens = script.getDisplayTargetTokens();

			if (transTokens.length != scriptTokens.length) {
				String message =
						"Translated sentences have length mismatch: "
								+ trans.getMyLine()
								+ " and "
								+ script.getMyLine()
								+ " (Pair "
								+ (i + 1)
								+ ")\n"
								+ trans.getDisplayTargetSentence()
								+ "\n"
								+ script.getDisplayTargetSentence()
								+ "\n"
								+ "Comment: "
								+ trans.getComment()
								+ "\n"
								+ findMissing(transTokens, scriptTokens, transliterationLexicon);
				warnOrFail(message, ignoreErrors, err);
				nErrors++;
			} else {

				// check for Roman garbage; this is only ever a warning and is
				// not included in the error count
				String strScript = script.getDisplayTargetSentence();
				if (UtfUtils.containsRomanLetters(strScript)) {
					err.println("WARNING: Script sentence contains Roman letters: " + strScript);
				}

				// check sanity of transliteration and extend the lexicon
				String ambiguityNote =
						recordTransliterations(trans, script, transliterationLexicon);

				// must happen before the target sentence is possibly replaced
				// below, so that the context holds the transliterated text
				if (translitInContext) {
					trans.setContext(trans.getDisplayTargetSentence());
				}

				if (replaceTarget) {
					trans.setDisplayTargetSentence(script.getDisplayTargetSentence());
					trans.setComment(UtfUtils.removeUnicodeChars(trans.getComment().trim()));
				} else {
					String prevComment = UtfUtils.removeUnicodeChars(trans.getComment().trim());
					trans.setComment(prevComment + " " + script.getDisplayTargetSentence());
				}

				trans.setComment(trans.getComment() + ambiguityNote);
			}

			// the source side is compared for every pair, whether or not the
			// target lengths matched
			if (!trans.getDisplaySourceSentence().equals(script.getDisplaySourceSentence())) {
				String message =
						"Translated sentences have a source mismatch: " + trans.getMyLine()
								+ " and " + script.getMyLine() + " (Pair " + (i + 1) + ")\n"
								+ trans.getDisplaySourceSentence() + "\n"
								+ script.getDisplaySourceSentence();
				warnOrFail(message, ignoreErrors, err);
				nErrors++;
			}
		}

		Serializer.saveSentencePairs(transliteratedPairs, outFile);

		for (final Entry<String, HashSet<String>> entry : transliterationLexicon.entrySet()) {
			out.println(entry.getKey() + " --> "
					+ StringUtils.untokenize(entry.getValue(), " // "));
		}

		err.println(nErrors + " errors encountered.");
	}

	/**
	 * Reports a corpus inconsistency: prints it as a warning when errors are
	 * being ignored, otherwise aborts by throwing. Callers keep the error count
	 * themselves.
	 * 
	 * @param message
	 *            description of the problem
	 * @param ignoreErrors
	 *            whether <code>--ignore-errors</code> was given
	 * @param err
	 *            stream that receives the warning
	 * @throws RuntimeException
	 *             carrying <code>message</code> if errors are not ignored
	 */
	private static void warnOrFail(String message, boolean ignoreErrors, PrintStream err) {
		if (!ignoreErrors) {
			throw new RuntimeException(message);
		}
		err.println("WARNING: " + message);
	}

	/**
	 * Pairs up the target tokens of the two sentences position by position,
	 * records every (transliterated, script) pairing in the lexicon and builds
	 * a note when a transliterated token is seen with a script token it has
	 * not been paired with before. Such ambiguities are neither printed nor
	 * counted as errors; they only end up in the returned note.
	 * <p>
	 * The caller must have verified that both sentences have the same number
	 * of target tokens.
	 * 
	 * @param trans
	 *            the transliterated sentence pair
	 * @param script
	 *            the script sentence pair at the same corpus position
	 * @param lexicon
	 *            transliterated token to script tokens; updated in place
	 * @return the note for the last ambiguous token of the sentence, or the
	 *         empty string if there was none
	 */
	private static String recordTransliterations(SentencePair trans, SentencePair script,
			TreeMap<String, HashSet<String>> lexicon) {

		String[] transTokens = trans.getDisplayTargetTokens();
		String[] scriptTokens = script.getDisplayTargetTokens();

		// NOTE(review): when a sentence has several ambiguous tokens only the
		// last note survives. Kept as is because concatenating notes would put
		// line breaks inside the comment; confirm the serializer copes before
		// changing this.
		String note = "";
		for (int j = 0; j < transTokens.length; j++) {
			String transLex = transTokens[j];
			String scriptLex = scriptTokens[j];
			HashSet<String> prevLex = lexicon.get(transLex);

			if (prevLex != null && !prevLex.contains(scriptLex)) {
				note =
						" !!!" + "Transliteration ambiguity detected for word: \"" + transLex
								+ "\" = \"" + StringUtils.untokenize(prevLex, " // ")
								+ "\" and \"" + scriptLex + "\"" + "(id=" + trans.getId() + ") "
								+ trans.getDisplaySourceSentence()
								+ " on script file line " + script.getMyLine() + "\n";
			}
			// presumably adds scriptLex to the set stored under transLex,
			// creating the set on first use -- confirm against HashUtils
			HashUtils.put(lexicon, transLex, scriptLex);
		}
		return note;
	}

	/**
	 * Lists the transliterated tokens whose previously recorded script forms
	 * are all absent from the script sentence. Tokens without a lexicon entry
	 * are skipped, since nothing is known about them yet.
	 * 
	 * @param arrTrans
	 *            tokens of the transliterated sentence
	 * @param arrScript
	 *            tokens of the script sentence
	 * @param transLexicon
	 *            transliterated token to script tokens seen so far; not
	 *            modified
	 * @return one "token --> known script forms" entry per such token, in
	 *         sentence order; empty if there are none
	 */
	private static ArrayList<String> findMissing(String[] arrTrans, String[] arrScript,
			TreeMap<String, HashSet<String>> transLexicon) {

		ArrayList<String> result = new ArrayList<String>();
		HashSet<String> hsScript = new HashSet<String>(Arrays.asList(arrScript));
		for (final String strTrans : arrTrans) {

			HashSet<String> transEntries = transLexicon.get(strTrans);
			if (transEntries == null) {
				continue;
			}

			boolean found = false;
			for (final String expectedScript : transEntries) {
				if (hsScript.contains(expectedScript)) {
					found = true;
					break;
				}
			}
			if (!found) {
				result.add(strTrans + " --> " + StringUtils.untokenize(transEntries, " // "));
			}
		}

		return result;
	}

	/**
	 * Wraps a standard stream so that everything printed through the wrapper
	 * is encoded as UTF-8, independent of the platform default charset.
	 * 
	 * @param raw
	 *            the stream to wrap
	 * @return an auto-flushing UTF-8 print stream on top of <code>raw</code>
	 * @throws UnsupportedEncodingException
	 *             if the JVM does not support UTF-8
	 */
	private static PrintStream utf8(PrintStream raw) throws UnsupportedEncodingException {
		return new PrintStream(raw, true, ENCODING);
	}
}
