package edu.cmu.cs.lti.avenue.navigation.search.oracle;

import info.jonclark.util.ArrayUtils;
import info.jonclark.util.FormatUtils;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;

import edu.cmu.cs.lti.avenue.atavi.AtaviWrapper;
import edu.cmu.cs.lti.avenue.corpus.Corpus;
import edu.cmu.cs.lti.avenue.corpus.CorpusException;
import edu.cmu.cs.lti.avenue.corpus.SentencePair;
import edu.cmu.cs.lti.avenue.corpus.Serializer;
import edu.cmu.cs.lti.avenue.navigation.search.oracle.cfglm.CfgLanguageModel;
import edu.cmu.cs.lti.avenue.projection.ConstituentStructureProjector;
import edu.cmu.cs.lti.avenue.projection.ProjectionAnalyzer;
import edu.cmu.cs.lti.avenue.projection.ProjectionConstraints;
import edu.cmu.cs.lti.avenue.projection.ordering.EndpointOrderingModel;
import edu.cmu.cs.lti.avenue.projection.ordering.OrderingModel;
import edu.cmu.cs.lti.avenue.trees.cfg.SyncCfgRule;

/**
 * A method of "navigating" (selecting sentences) that does not rely on feature
 * detection or linguistic knowledge sources, but instead selects sentences from
 * MT output by comparing to references and determining which ones we are most
 * likely to be able to improve (those that have good lexical choices, but poor
 * ordering choices). These sentences are then sent to a bilingual person for
 * retranslation and then fed back into the MT training process in hopes of
 * correcting those errors. This method was developed for the post-NIST MT Eval
 * 2008 experiment and the data was sent to three external research groups, but
 * the results were never received from those groups.
 * 
 * @author jon
 */
public class NavigatorOracle {

	/**
	 * Directory that {@link #main(String[])} writes per-sentence ATAVI files
	 * to when no directory is given on the command line.
	 */
	private static final String DEFAULT_ATAVI_OUT_DIR =
			"/Users/jon/Documents/workspace/letras/ATAVI/Corpus/Sentences";

	/**
	 * Orders hypotheses by ascending {@link OracleHypothesis#score}. Scores
	 * that cannot be ordered (NaN) compare as equal.
	 */
	private static final Comparator<OracleHypothesis> BY_ASCENDING_SCORE =
			new Comparator<OracleHypothesis>() {
				public int compare(OracleHypothesis o1, OracleHypothesis o2) {
					if (o1.score < o2.score) {
						return -1;
					} else if (o1.score > o2.score) {
						return 1;
					} else {
						return 0;
					}
				}
			};

	/**
	 * A candidate sentence pair together with the bookkeeping produced by the
	 * most recent call to {@link NavigatorOracle#scoreHypotheses(ArrayList)}.
	 */
	private static class OracleHypothesis {
		/** The sentence pair this hypothesis wraps. */
		public SentencePair pair;
		/** Number of entries in {@link #novelRules} after the last scoring pass. */
		public int nNovelRules;
		/** Sum of the CFG language model scores of the novel rules' source sides. */
		public double score;
		/** Rules learned from {@link #pair}; <code>null</code> until first scored. */
		public ArrayList<SyncCfgRule> rules;
		/** Distinct rules of this pair that were not yet known at the last scoring pass. */
		public ArrayList<SyncCfgRule> novelRules = new ArrayList<SyncCfgRule>(15);

		public OracleHypothesis(SentencePair pair) {
			this.pair = pair;
		}
	}

	/** Candidates that have not been selected yet. */
	private final ArrayList<OracleHypothesis> remainingHypotheses;
	/** All rules contributed by the hypotheses selected so far. */
	private final HashSet<SyncCfgRule> knownRules = new HashSet<SyncCfgRule>();
	/** Hypotheses selected so far, in selection order. */
	private final ArrayList<OracleHypothesis> selectedHypotheses =
			new ArrayList<OracleHypothesis>();
	private final CfgLanguageModel cfgLanguageModel;

	/**
	 * Projects a target constituent structure onto every given sentence pair
	 * (the pairs are modified in place), registers each pair as a candidate
	 * hypothesis, and then loads the CFG language model.
	 * 
	 * @param sentences
	 *            the candidate sentence pairs
	 * @param cfgLanguageModelData
	 *            the file handed to the {@link CfgLanguageModel} constructor
	 * @throws CorpusException
	 *             propagated from projection or language model construction
	 * @throws IOException
	 *             propagated from projection or language model construction
	 */
	public NavigatorOracle(ArrayList<SentencePair> sentences, File cfgLanguageModelData)
			throws CorpusException, IOException {

		OrderingModel orderingModel = new EndpointOrderingModel(true);
		ProjectionConstraints projectionConstraints = new ProjectionConstraints();
		ConstituentStructureProjector projector =
				new ConstituentStructureProjector(orderingModel, projectionConstraints);

		remainingHypotheses = new ArrayList<OracleHypothesis>(sentences.size());
		for (final SentencePair pair : sentences) {
			pair.setTargetConstituentStructure(projector.project(pair));
			remainingHypotheses.add(new OracleHypothesis(pair));
		}

		cfgLanguageModel = new CfgLanguageModel(cfgLanguageModelData);
	}

	/**
	 * Moves a hypothesis from the remaining set to the selected set and marks
	 * all of its rules as known. The hypothesis must already have been scored
	 * so that its rules are populated.
	 */
	private void commitHypothesis(OracleHypothesis hypothesis) {
		selectedHypotheses.add(hypothesis);
		remainingHypotheses.remove(hypothesis);
		knownRules.addAll(hypothesis.rules);
	}

	/**
	 * Recomputes the novel rules and score of each given hypothesis against
	 * the current set of known rules. A rule counts as novel when it is not
	 * yet known and has not already been counted for the same hypothesis; the
	 * score is the sum of the language model scores of the novel rules' source
	 * sides. The learned rules of a hypothesis are extracted on first use and
	 * reused afterwards.
	 * 
	 * @param remainingHypotheses
	 *            the hypotheses to (re)score
	 */
	public void scoreHypotheses(ArrayList<OracleHypothesis> remainingHypotheses) {

		for (final OracleHypothesis hypothesis : remainingHypotheses) {
			if (hypothesis.rules == null) {
				hypothesis.rules = ProjectionAnalyzer.getLearnedRules(hypothesis.pair, true);
			}

			HashSet<SyncCfgRule> seenInThisHypothesis = new HashSet<SyncCfgRule>();

			hypothesis.novelRules.clear();
			hypothesis.score = 0;
			for (final SyncCfgRule rule : hypothesis.rules) {
				// Set.add returns false for a duplicate, so each distinct
				// unknown rule is counted exactly once
				if (!knownRules.contains(rule) && seenInThisHypothesis.add(rule)) {
					hypothesis.novelRules.add(rule);
					hypothesis.score += cfgLanguageModel.getScore(rule.getSourceRule());
				}
			}
			hypothesis.nNovelRules = hypothesis.novelRules.size();
		}
	}

	/**
	 * Greedily selects hypotheses: rescores all remaining hypotheses, commits
	 * the one picked by {@link ArrayUtils#indexOfMax} under the score
	 * ordering, and repeats. The search stops as soon as
	 * <ul>
	 * <li>at least <code>nBest</code> hypotheses have been selected (so a
	 * non-positive <code>nBest</code> selects nothing),</li>
	 * <li>no candidates remain, or</li>
	 * <li>the picked hypothesis has fewer than <code>nThreshold</code> novel
	 * rules.</li>
	 * </ul>
	 * 
	 * @param nBest
	 *            the maximum number of hypotheses to have selected
	 * @param nThreshold
	 *            the minimum number of novel rules a hypothesis needs in
	 *            order to be selected
	 * @return the list of hypotheses selected so far (the oracle's own list,
	 *         not a copy)
	 */
	public ArrayList<OracleHypothesis> search(int nBest, int nThreshold) {

		// Checking before each pick (instead of after each commit) keeps us
		// from indexing into an exhausted candidate list and from
		// overshooting nBest.
		while (selectedHypotheses.size() < nBest && !remainingHypotheses.isEmpty()) {
			scoreHypotheses(remainingHypotheses);
			int nMax = ArrayUtils.indexOfMax(remainingHypotheses, 1, BY_ASCENDING_SCORE);
			OracleHypothesis max = remainingHypotheses.get(nMax);

			if (max.nNovelRules < nThreshold) {
				break;
			}

			commitHypothesis(max);
			System.out.println("Selected sentence with " + max.nNovelRules
					+ " novel rules with score " + max.score);
		}

		return selectedHypotheses;
	}

	/**
	 * Removes, in place, every sentence pair whose display alignment has no
	 * raw alignments. The relative order of the remaining pairs is kept.
	 * 
	 * @param sentences
	 *            the list to filter; modified only if something is removed
	 */
	public static void removeUnalignedSentences(ArrayList<SentencePair> sentences) {
		// Single pass into a side list avoids repeated element shifting and
		// index juggling while removing
		ArrayList<SentencePair> aligned = new ArrayList<SentencePair>(sentences.size());
		for (final SentencePair pair : sentences) {
			if (pair.getDisplayAlignment().getRawAlignments().length != 0) {
				aligned.add(pair);
			}
		}
		if (aligned.size() != sentences.size()) {
			sentences.clear();
			sentences.addAll(aligned);
		}
	}

	/** Writes the plain-text record for one selected sentence pair. */
	private static void writePairRecord(PrintWriter out, SentencePair pair) {
		out.println("newpair");
		out.println("srcsent: " + pair.getDisplaySourceSentence());
		out.println("tgtsent: " + pair.getDisplayTargetSentence());
		out.println("aligned: " + pair.getDisplayAlignment());
		out.println("context: " + pair.getContext());
		out.println("comment: " + pair.getComment());
		out.println();
	}

	/**
	 * Builds the comment text that accompanies one selected hypothesis in its
	 * ATAVI file. As a side effect, the hypothesis' novel rules are sorted by
	 * descending language model score.
	 */
	private static String buildComments(NavigatorOracle oracle, OracleHypothesis hypothesis) {
		final CfgLanguageModel languageModel = oracle.cfgLanguageModel;

		StringBuilder comments = new StringBuilder();
		comments.append(AtaviWrapper.getBasicSentenceComments(hypothesis.pair));
		comments.append("``PROBABILITY MASS'' (SCORE): "
				+ FormatUtils.formatDoubleExp(hypothesis.score) + "\n");
		comments.append("NOVEL RULES: " + hypothesis.nNovelRules + "\n");

		// sort novel rules by their score, highest first (NOTE: This is very
		// inefficient, since scores are looked up again on every comparison)
		Collections.sort(hypothesis.novelRules, new Comparator<SyncCfgRule>() {
			public int compare(SyncCfgRule o1, SyncCfgRule o2) {
				double score1 = languageModel.getScore(o1.getSourceRule());
				double score2 = languageModel.getScore(o2.getSourceRule());
				if (score1 < score2) {
					return 1;
				} else if (score2 < score1) {
					return -1;
				} else {
					return 0;
				}
			}
		});

		for (final SyncCfgRule novelRule : hypothesis.novelRules) {
			comments.append("NOVEL RULE: "
					+ novelRule.toString(false)
					+ "(PennTreeProb: "
					+ FormatUtils.formatDoubleExp(languageModel.getScore(novelRule.getSourceRule()))
					+ ")\n");
		}
		comments.append(AtaviWrapper.getProjectionFeaturesComments(hypothesis.pair));

		// Hash lookup instead of a linear scan of the novel rule list for
		// every rule
		HashSet<SyncCfgRule> novel = new HashSet<SyncCfgRule>(hypothesis.novelRules);
		for (final SyncCfgRule rule : hypothesis.rules) {
			if (!novel.contains(rule)) {
				comments.append("OLD RULE:  " + rule.toString() + "\n");
			}
		}
		for (final String rule : ProjectionAnalyzer.getFailedRules(hypothesis.pair)) {
			comments.append("FAILED RULE:  " + rule + "\n");
		}
		comments.append("LINKS: ");
		for (final String link : ProjectionAnalyzer.getLearnedLinks(hypothesis.pair)) {
			comments.append(link + ", ");
		}
		comments.append("\n");

		return comments.toString();
	}

	/**
	 * Command-line entry point: loads a corpus, drops unaligned sentence
	 * pairs, runs the greedy search and writes the selected pairs both to
	 * <code>out_file</code> and, one ATAVI sentence per pair, to the ATAVI
	 * output directory.
	 * 
	 * @param args
	 *            <code>in_file cfg_lm_data n_best n_threshold out_file</code>,
	 *            optionally followed by the ATAVI output directory (defaults
	 *            to {@link #DEFAULT_ATAVI_OUT_DIR})
	 * @throws Exception
	 *             on any failure while loading, searching or writing
	 */
	public static void main(String[] args) throws Exception {

		if (args.length != 5 && args.length != 6) {
			System.err.println("Usage: program <in_file> <cfg_lm_data> <n_best> <n_threshold> <out_file> [<atavi_out_dir>]");
			System.exit(1);
		}

		String encoding = "UTF-8";

		File outDir = new File(args.length == 6 ? args[5] : DEFAULT_ATAVI_OUT_DIR);

		// TODO: Rank based on structure vs other rules as vamshi does
		// TODO: Does it affect Urdu TO ENGLISH translation?

		// First, project syntax trees for all sentences, extracting features
		// along the way
		Corpus corpus = Serializer.loadSentencePairs(new File(args[0]), encoding);
		File cfgLanguageModelData = new File(args[1]);
		int nBest = Integer.parseInt(args[2]);
		int nThreshold = Integer.parseInt(args[3]);
		File outFile = new File(args[4]);

		removeUnalignedSentences(corpus.getSentences());

		NavigatorOracle oracle = new NavigatorOracle(corpus.getSentences(), cfgLanguageModelData);
		ArrayList<OracleHypothesis> selectedHypotheses = oracle.search(nBest, nThreshold);

		System.out.println("Selected " + selectedHypotheses.size() + " sentences out of "
				+ corpus.getSentences().size());

		PrintWriter out = new PrintWriter(outFile, encoding);
		try {
			int i = 0;
			for (final OracleHypothesis hypothesis : selectedHypotheses) {
				writePairRecord(out, hypothesis.pair);
				String comments = buildComments(oracle, hypothesis);
				AtaviWrapper.writeAtaviSentence(outDir, i, hypothesis.pair, comments);
				i++;
			}

			// PrintWriter never throws on write failures; ask it explicitly
			// so a truncated output file does not go unnoticed
			if (out.checkError()) {
				throw new IOException("Error while writing " + outFile);
			}
		} finally {
			out.close();
		}

		// 1) Use greedy algorithm to rank realized RULE diversity
		// 2) incorporate frequency of PennTreebank source frequencies for
		// realized rules
		// 3) incorporate blending of other projection features
		// 4) rank frequency of lexicalized words (n-gram counts from penn
		// treebank) (incorporate LM smoothing model)
		// 5) Incorporate ruleless feature detection
	}
}
