/**
 * The AVENUE Project
 * Language Technologies Institute
 * School of Computer Science
 * (c) 2007 Carnegie Mellon University
 * 
 * Corpus Navigator
 * Written by Jonathan Clark
 */
package edu.cmu.cs.lti.avenue.navigation.tools;

import info.jonclark.corpus.tokenize.EnglishTokenizer;
import info.jonclark.properties.SmartProperties;
import info.jonclark.util.ArrayUtils;
import info.jonclark.util.FileUtils;
import info.jonclark.util.FormatUtils;
import info.jonclark.util.StringUtils;
import it.unimi.dsi.fastutil.objects.Object2ObjectArrayMap;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map.Entry;

import edu.cmu.cs.lti.avenue.corpus.Alignment;
import edu.cmu.cs.lti.avenue.corpus.Corpus;
import edu.cmu.cs.lti.avenue.corpus.SentencePair;
import edu.cmu.cs.lti.avenue.corpus.Serializer;
import edu.cmu.cs.lti.avenue.trees.smart.TreeNode;
import edu.cmu.cs.lti.avenue.trees.smart.SmartTree.LabelDisplay;

/**
 * This program takes the output of the Elicitation Tool (which contains the
 * source sentences, target sentences, and alignments) and the output of the
 * PhiPlusMapper (which contains source sentences, f-structures, c-structures,
 * the phi mapping, and a head mapping) and combines them into a single file.
 * All names are replaced with the names given in the EC from the ET.
 */
public class ElicitedCorpusAligner {

	/** Separator placed between a sentence key and its context to form a context key. */
	private static final String CONTEXT_SEPARATOR = "|||";

	/** Male names read from the configured names file; populated by {@link #main}. */
	private static String[] maleNames;

	/** Female names read from the configured names file; populated by {@link #main}. */
	private static String[] femaleNames;

	/**
	 * Rewrites a sentence by substituting, for every entry of the map, the
	 * entry's key with the entry's value (the inverse direction of
	 * {@link #applyMeaningFixes}).
	 * 
	 * @param meaningFixes the substitution table, applied in its iteration order
	 * @param sentence the sentence to rewrite
	 * @return the sentence after all substitutions have been applied in turn
	 */
	public static String undoMeaningFixes(Object2ObjectArrayMap<String, String> meaningFixes,
			String sentence) {
		for (Entry<String, String> fix : meaningFixes.entrySet()) {
			sentence = StringUtils.replaceFast(sentence, fix.getKey(), fix.getValue());
		}
		return sentence;
	}

	/**
	 * Rewrites a sentence by substituting, for every entry of the map, the
	 * entry's value with the entry's key (the inverse direction of
	 * {@link #undoMeaningFixes}).
	 * 
	 * @param meaningFixes the substitution table, applied in its iteration order
	 * @param sentence the sentence to rewrite
	 * @return the sentence after all substitutions have been applied in turn
	 */
	public static String applyMeaningFixes(Object2ObjectArrayMap<String, String> meaningFixes,
			String sentence) {
		for (Entry<String, String> fix : meaningFixes.entrySet()) {
			sentence = StringUtils.replaceFast(sentence, fix.getValue(), fix.getKey());
		}
		return sentence;
	}

	/**
	 * Replaces every (lower-cased) female name with <code>any_female_name</code>
	 * and every (lower-cased) male name with <code>any_male_name</code>. The
	 * substitution pass is repeated until a pass leaves the sentence unchanged.
	 * 
	 * NOTE(review): the replacement is a plain substring replacement, so a name
	 * that occurs inside one of the placeholder strings would keep this method
	 * recursing -- confirm the name lists cannot contain such entries.
	 * 
	 * @param sentence the sentence to rewrite; callers pass it already lower-cased
	 * @return the sentence with names replaced by placeholders
	 */
	private static String replaceNames(String sentence) {
		final String before = sentence;

		for (String femaleName : femaleNames) {
			sentence =
					StringUtils.replaceFast(sentence, femaleName.toLowerCase(), "any_female_name");
		}
		for (String maleName : maleNames) {
			sentence = StringUtils.replaceFast(sentence, maleName.toLowerCase(), "any_male_name");
		}

		if (before.equals(sentence)) {
			return sentence;
		}
		// something changed, so run another pass over the rewritten sentence
		return replaceNames(sentence);
	}

	/**
	 * Builds the lookup key under which a sentence pair is indexed and matched:
	 * the normalized source tokens (optionally without punctuation) are joined,
	 * lower-cased, stripped of names via {@link #replaceNames} and finally passed
	 * through {@link #applyMeaningFixes}. Both the phi side and the elicited side
	 * use this method so that their keys are always derived identically.
	 * 
	 * @param pair the sentence pair whose source side is keyed
	 * @param noPunc whether punctuation tokens are removed before keying
	 * @param meaningFixes the substitution table handed to applyMeaningFixes
	 * @return the normalized key
	 */
	private static String buildKey(SentencePair pair, boolean noPunc,
			Object2ObjectArrayMap<String, String> meaningFixes) {
		String[] tokens = pair.getNormalizedSourceTokens();
		if (noPunc) {
			tokens = EnglishTokenizer.removePunctuation(tokens);
		}
		String key = StringUtils.untokenize(tokens).toLowerCase();
		key = replaceNames(key);
		return applyMeaningFixes(meaningFixes, key);
	}

	/**
	 * Appends a sentence pair to the candidate list stored under the given key,
	 * creating the list on first use.
	 * 
	 * @param index the key-to-candidates index to update
	 * @param key the key to file the pair under
	 * @param pair the sentence pair to add
	 */
	private static void addToIndex(HashMap<String, SentenceList> index, String key,
			SentencePair pair) {
		SentenceList candidates = index.get(key);
		if (candidates == null) {
			candidates = new SentenceList();
			index.put(key, candidates);
		}
		candidates.add(pair);
	}

	/**
	 * An append-only list of sentence pairs with a forward-only read cursor.
	 * Each call to {@link #next()} hands out the next pair, so every pair is
	 * handed out at most once.
	 */
	private static class SentenceList {
		private final ArrayList<SentencePair> list = new ArrayList<SentencePair>();
		private int next;

		/** Appends a sentence pair to the end of the list. */
		public void add(SentencePair sentence) {
			list.add(sentence);
		}

		/** @return true if {@link #next()} can still hand out a pair */
		public boolean hasNext() {
			return next < list.size();
		}

		/** @return the pair at the cursor; the cursor is then advanced by one */
		public SentencePair next() {
			return list.get(next++);
		}

		/** @return the total number of pairs added, regardless of the cursor */
		public int size() {
			return list.size();
		}
	}

	/**
	 * Merges the elicited corpus with the phi-mapped corpus and writes the
	 * result to the configured output file, then prints matching statistics.
	 * 
	 * Every phi sentence is indexed by its normalized key (and by key plus
	 * context). Each elicited sentence is then looked up in that index; when a
	 * candidate is found, the elicited display sentences and alignments are
	 * copied onto the phi pair, its comment is cleared, and the pair is
	 * serialized to the output file.
	 * 
	 * @param args exactly one argument: the path of the properties file
	 * @throws Exception if the properties, the corpora or the output file cannot
	 *             be read or written
	 */
	public static void main(String[] args) throws Exception {
		if (args.length != 1) {
			System.err.println("Usage: program <properties_file>");
			System.exit(1);
		}

		SmartProperties props = new SmartProperties(args[0]);

		String encoding = props.getPropertyString("global.encoding");
		File elicitedFile = props.getPropertyFile("paths.translatedCorpusWithContext");
		File phiFile = props.getPropertyFile("paths.phiUntranslatedCorpus");
		File outputFile = props.getPropertyFile("paths.mergedCorpus");
		File maleNamesFile = props.getPropertyFile("paths.maleNamesFile");
		File femaleNamesFile = props.getPropertyFile("paths.femaleNamesFile");
		File meaningFixesFile = props.getPropertyFile("paths.meaningFixes");

		boolean useContext = props.getPropertyBoolean("corpusAligner.useContext");
		boolean noPunc = props.getPropertyBoolean("corpusAligner.ignorePunctuation");
		boolean skipDups = props.getPropertyBoolean("corpusAligner.skipDuplicates");
		System.out.println("Use context = " + useContext);
		System.out.println("Ignore punctuation = " + noPunc);
		System.out.println("Skip duplicates = " + skipDups);

		// first read in male and female names
		maleNames = StringUtils.tokenize(FileUtils.getFileAsString(maleNamesFile), "\n");
		femaleNames = StringUtils.tokenize(FileUtils.getFileAsString(femaleNamesFile), "\n");

		Object2ObjectArrayMap<String, String> meaningFixes = loadMeaningFixes(meaningFixesFile);

		// index the phi sentences both by key alone and by key + context
		Corpus phiSentencesList = Serializer.loadSentencePairs(phiFile, encoding);
		HashMap<String, SentenceList> phiSentences = new HashMap<String, SentenceList>();
		HashMap<String, SentenceList> phiSentencesContext = new HashMap<String, SentenceList>();

		// feature structure (as a string) -> first pair written with that structure
		HashMap<String, SentencePair> usedFeatureStructures = new HashMap<String, SentencePair>();

		for (final SentencePair phiPair : phiSentencesList.getSentences()) {
			String key = buildKey(phiPair, noPunc, meaningFixes);
			String contextKey = key + CONTEXT_SEPARATOR + phiPair.getContext();

			addToIndex(phiSentences, key, phiPair);
			addToIndex(phiSentencesContext, contextKey, phiPair);
		}

		System.out.println("Read in " + phiSentences.size() + " phi candiates.");

		Corpus elicitedSentences = Serializer.loadSentencePairs(elicitedFile, encoding);
		final int nElicited = elicitedSentences.getSentences().size();

		int nMatched = 0;
		int nUnmatched = 0;
		int nUnmatchedContext = 0;
		int nExhausted = 0;
		int nErrors = 0;
		int nSkipped = 0;
		int nDuplicates = 0;
		int nTotalDuplicates = 0;

		PrintWriter out = new PrintWriter(outputFile, encoding);
		try {
			System.out.println("Read in " + nElicited + " elicited sentences to be matched.");

			int id = 0;
			for (SentencePair elicitedPair : elicitedSentences.getSentences()) {

				// assign ID# to each elicited sentence
				id++;
				elicitedPair.setId(id);

				String key = buildKey(elicitedPair, noPunc, meaningFixes);
				String contextKey = key + CONTEXT_SEPARATOR + elicitedPair.getContext();

				SentenceList list;
				if (useContext) {
					list = phiSentencesContext.get(contextKey);
				} else {
					list = phiSentences.get(key);
				}

				if (list == null) {
					// no candidate at all: report it, nothing is written
					if (!useContext) {
						System.out.println("No match found for sentence: " + key + " at "
								+ elicitedPair.getMyLine());
						// count the miss so that the summary reports it
						nUnmatched++;
					} else {
						System.out.println("No match found for sentence: " + contextKey
								+ " at " + elicitedPair.getMyLine());

						// try not using context (for reporting only)
						SentenceList noContextList = phiSentences.get(key);
						if (noContextList == null) {
							System.out.println("\tNo match found without context, either.");
							nUnmatched++;
						} else if (noContextList.hasNext()) {
							SentencePair fallback = noContextList.next();
							System.out.println("\t***Match found without context: " + key
									+ " with context: " + fallback.getContext() + " at "
									+ fallback.getMyLine() + " / "
									+ fallback.getFeatureSourceLine());
							nUnmatchedContext++;
						} else {
							System.out.println("WARNING: List exhuasted for: " + key);
							nExhausted++;
						}
					}
					System.out.println();
					continue;
				}

				if (!list.hasNext()) {
					// more elicited sentences than phi candidates for this key
					System.out.println("WARNING: List exhuasted for: " + key);
					nExhausted++;
					continue;
				}

				SentencePair phiPair = list.next();

				if (phiPair.getFeatureStructure().toString().contains("mod-role")) {
					nSkipped++;
					continue;
				}

				if (phiPair.getComment().contains("inaccurate")) {
					nSkipped++;
					continue;
				}

				String strFS = phiPair.getFeatureStructure().toString(LabelDisplay.NONE);
				SentencePair used = usedFeatureStructures.get(strFS);
				if (used != null) {
					nDuplicates++;

					// same feature structure AND same sentence: a total duplicate
					if (Arrays.equals(used.getDisplaySourceTokens(),
							phiPair.getDisplaySourceTokens())) {
						nTotalDuplicates++;
						if (skipDups) {
							continue;
						}
					}
					// NOTE: duplicates that are not skipped are still written; an
					// earlier idea of retrying the same elicited sentence with the
					// next candidate was never enabled.
				}
				usedFeatureStructures.put(strFS, phiPair);

				// copy information from elicitedPair to phiPair and write
				phiPair.setId(elicitedPair.getId());
				// alignments are reset before the display sentences change
				// (presumably so that they never refer to the old sentences)
				phiPair.setAlignments(new Alignment("", 0, 0));
				phiPair.setDisplaySourceSentence(elicitedPair.getDisplaySourceSentence());
				phiPair.setDisplayTargetSentence(elicitedPair.getDisplayTargetSentence());
				phiPair.setAlignments(elicitedPair.getDisplayAlignment());
				// comments are deliberately cleared in the merged corpus
				phiPair.setComment("");

				ArrayList<TreeNode> terminals =
						phiPair.getSourceConstituentStructure().getTerminalNodes();
				String[] sourceTokens = phiPair.getNormalizedSourceTokens();

				// the tree has exactly one terminal more than the sentence has
				// tokens: try appending a final period to the sentence
				if (terminals.size() - 1 == sourceTokens.length) {
					System.out.println("Attempting to fix length mismatch by adding punctuation: "
							+ StringUtils.untokenize(sourceTokens));
					phiPair.setDisplaySourceSentence(phiPair.getDisplaySourceSentence() + ".");
					sourceTokens = phiPair.getNormalizedSourceTokens();
				}

				if (terminals.size() != sourceTokens.length) {
					System.out.println("ERROR: length mismatch: \n "
							+ StringUtils.untokenize(terminals) + "\n"
							+ StringUtils.untokenize(sourceTokens));
					nErrors++;
					continue;
				}

				// overwrite terminal values that differ from the elicited tokens
				for (int j = 0; j < sourceTokens.length; j++) {
					TreeNode terminal = terminals.get(j);
					if (!terminal.getValues().get(1).toLowerCase().equals(sourceTokens[j])) {
						terminal.getValues().set(1, sourceTokens[j]);
					}
				}

				out.println(phiPair.serialize());

				nMatched++;
			}
		} finally {
			// always release the output file, even if matching failed midway
			out.close();
		}

		System.out.println("Successfully matched " + nMatched + " sentences from "
				+ phiSentencesList.getSentences().size() + " fs/phi mapped sentences and "
				+ nElicited + " elicited sentences.");

		double success = 100 * (double) (nMatched + nSkipped) / (double) nElicited;
		System.out.println("Success rate: " + FormatUtils.formatDouble2(success) + "%");
		double remaining = 100 * (double) nMatched / (double) nElicited;
		System.out.println("Corpus remaining: " + FormatUtils.formatDouble2(remaining) + "%");

		if (nUnmatched > 0)
			System.out.println("Could not match " + nUnmatched + " sentences for lexical items.");
		if (nUnmatchedContext > 0)
			System.out.println("Could not match " + nUnmatchedContext + " sentences for context.");
		if (nDuplicates > 0)
			System.out.println("Detected " + nDuplicates + " duplicate feature structures.");
		if (nTotalDuplicates > 0)
			System.out.println("Detected " + nTotalDuplicates
					+ " duplicate (feature structure, sentence) pairs.");
		if (nExhausted > 0)
			System.out.println("Exhausted list for " + nExhausted + " sentences.");
		if (nErrors > 0)
			System.out.println("Encountered errors for " + nErrors + " sentences.");
		if (nSkipped > 0)
			System.out.println("Skipped " + nSkipped
					+ " sentences according to user specified pattern (mod-role features, etc).");
		System.out.println("Cleared all comments.");
		System.out.println("WROTE MERGED CORPUS FILE: " + outputFile.getAbsolutePath());
	}

	/**
	 * Reads the meaning fixes table from a file. The file content is lower-cased
	 * and broken into lines; each line is split on <code>|||</code> and, when
	 * that yields exactly two fields, stored as (first field, second field).
	 * Lines that do not yield two fields are ignored.
	 * 
	 * @param meaningFixesFile the file to read
	 * @return the table, in the order the entries appear in the file
	 * @throws IOException if the file cannot be read
	 */
	public static Object2ObjectArrayMap<String, String> loadMeaningFixes(File meaningFixesFile)
			throws IOException {
		Object2ObjectArrayMap<String, String> meaningFixes =
				new Object2ObjectArrayMap<String, String>();

		String content = FileUtils.getFileAsString(meaningFixesFile).toLowerCase();
		for (String line : StringUtils.tokenize(content, "\n")) {
			String[] fields = StringUtils.split(line, "|||", 2);
			if (fields.length == 2) {
				meaningFixes.put(fields[0], fields[1]);
			}
		}
		return meaningFixes;
	}
}
