package edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive;

import info.jonclark.util.FormatUtils;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Set;

import edu.cmu.cs.lti.avenue.corpus.SentencePair;
import edu.cmu.cs.lti.avenue.featurespecification.FeatureValueSpec;
import edu.cmu.cs.lti.avenue.morphanalyzer.MorphologicalAnalyzer;
import edu.cmu.cs.lti.avenue.morphology.Paradigm;
import edu.cmu.cs.lti.avenue.morphology.Segmenter;
import edu.cmu.cs.lti.avenue.morphology.SegmenterException;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.evidence.ArcEvidenceCluster;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.evidence.FeatureMarking;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.evidence.NodeEvidence;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.evidence.ObservedMorpheme;
import edu.cmu.cs.lti.avenue.trees.smart.SmartTree;
import edu.cmu.cs.lti.avenue.trees.smart.TreeNode;

/**
 * The collecting point for morpheme-feature pairing data and a key component in
 * the MorphologicalAnalyzer. Determines which words are most likely to be
 * associated with a given FeatureValueCluster. This is a small footprint
 * object, which should likely have only one global instance.
 * <p>
 * Value clusters are registered with
 * {@link #addFeatureValueCluster(FeatureValueCluster)}; the themes that survive
 * pruning can then be read back with
 * {@link #getExpressedThemes(FeatureValueCluster)}, and the raw
 * morpheme-to-feature-value table with
 * {@link #getMorphemeFeatureValuePairingsFor(String)}.
 */
public class ExpressedThemeManager {

	private static final long serialVersionUID = -1743207015657595506L;
	private final double expressionThreshold;
	private final HashMap<String, Integer> corpusCounts;
	private final HashMap<String, ArrayList<SentencePair>> invertedCorpusVocabulary;
	private final boolean minPairsWithMoreChangesAreLessReliable;
	private final boolean scoreThemesWithInvertedVocabulry;
	private final boolean normalizeWeightedFrequencies;
	private final InductiveResultsEvaluator gold;

	// running totals over every cluster passed to addFeatureValueCluster()
	private int allKeptThemes = 0;
	private int allTotalThemes = 0;

	// value cluster -> themes that survived pruning, sorted by descending
	// weighted frequency
	private transient final HashMap<FeatureValueCluster, Collection<ExpressedTheme>> expressedThemes =
			new HashMap<FeatureValueCluster, Collection<ExpressedTheme>>();

	// feature name -> (observed morpheme -> feature value)
	private final HashMap<String, HashMap<ObservedMorpheme, String>> morphTable =
			new HashMap<String, HashMap<ObservedMorpheme, String>>();

	/**
	 * @param corpusCounts
	 *            stored by this manager; not read by any method of this class
	 * @param invertedCorpusVocabulary
	 *            maps a word to the sentence pairs associated with it; looked
	 *            up by morpheme string when scoring themes
	 * @param expressionThreshold
	 *            themes whose probability is below this value are pruned
	 * @param minPairsWithMoreChangesAreLessReliable
	 *            if true, the weight of a piece of evidence is divided by the
	 *            number of morphemes it added
	 * @param scoreThemesWithInvertedVocabulry
	 *            if true, a theme's probability is the fraction of corpus
	 *            sentences supporting it; otherwise it is the (optionally
	 *            normalized) weighted evidence frequency
	 * @param normalizeWeightedFrequencies
	 *            if true, weighted frequencies are divided by their sum over
	 *            the cluster before being used as a probability
	 * @param gold
	 *            gold-standard results, used only to label kept themes in the
	 *            log output
	 */
	public ExpressedThemeManager(HashMap<String, Integer> corpusCounts,
			HashMap<String, ArrayList<SentencePair>> invertedCorpusVocabulary,
			double expressionThreshold, boolean minPairsWithMoreChangesAreLessReliable,
			boolean scoreThemesWithInvertedVocabulry, boolean normalizeWeightedFrequencies,
			InductiveResultsEvaluator gold) {

		this.expressionThreshold = expressionThreshold;
		this.invertedCorpusVocabulary = invertedCorpusVocabulary;
		this.corpusCounts = corpusCounts;
		this.minPairsWithMoreChangesAreLessReliable = minPairsWithMoreChangesAreLessReliable;
		this.scoreThemesWithInvertedVocabulry = scoreThemesWithInvertedVocabulry;
		this.normalizeWeightedFrequencies = normalizeWeightedFrequencies;
		this.gold = gold;
	}

	/**
	 * @return the names of all features for which at least one morpheme has
	 *         been recorded
	 */
	public String[] getExpressedFeatureNames() {
		return morphTable.keySet().toArray(new String[morphTable.size()]);
	}

	/**
	 * Returns a map keyed on ObservedMorpheme and returns a String, which is
	 * the feature value associated with this morpheme.
	 * 
	 * @param featureName
	 *            the feature whose morphemes are wanted
	 * @return the live internal map for that feature, or null if no morpheme
	 *         has been recorded for it
	 */
	public HashMap<ObservedMorpheme, String> getMorphemeFeatureValuePairingsFor(String featureName) {
		return morphTable.get(featureName);
	}

	/**
	 * Get the expressed themes for a valueCluster, which has previously been
	 * added to this ExpressedThemeManager.
	 * 
	 * @param valueCluster
	 *            a cluster previously passed to
	 *            {@link #addFeatureValueCluster(FeatureValueCluster)}
	 * @return the themes kept for that cluster, sorted by descending weighted
	 *         frequency, or null if the cluster was never added
	 */
	public Collection<ExpressedTheme> getExpressedThemes(FeatureValueCluster valueCluster) {
		return expressedThemes.get(valueCluster);
	}

	/**
	 * Creates a MorphologicalAnalyzer backed by this manager and the given
	 * segmenter.
	 */
	public MorphologicalAnalyzer makeMorphologicalAnalyzer(Segmenter segmenter) {
		return new MorphologicalAnalyzer(this, segmenter);
	}

	/**
	 * Records that the given morpheme was observed for featureName with value
	 * featureValue. A later observation of an equal morpheme for the same
	 * feature overwrites the earlier value.
	 */
	private void addObservedMorpheme(String featureName, String featureValue, ObservedMorpheme morph) {

		HashMap<ObservedMorpheme, String> morphs = morphTable.get(featureName);
		if (morphs == null) {
			morphs = new HashMap<ObservedMorpheme, String>();
			morphTable.put(featureName, morphs);
		}
		morphs.put(morph, featureValue);
	}

	/**
	 * Computes the fraction of the sentences listed for word in the inverted
	 * vocabulary that (a) contain no other competing word and (b) have a
	 * feature-structure leaf whose value is one of featureValues. Sentences
	 * skipped because of (a) still count in the denominator.
	 * 
	 * @param word
	 *            the word to look up in the inverted vocabulary
	 * @param competingWords
	 *            the candidate words for the same value cluster; may include
	 *            word itself, which is never treated as its own competitor
	 * @param featureValues
	 *            the feature values a leaf may match
	 * @return matches divided by the number of sentences listed for word
	 */
	// TODO: Make this morpheme-based
	private double getPercentOfSentencesWhereThisWordHasOneOfTheseMeanings(String word,
			Set<String> competingWords, List<FeatureValueSpec> featureValues) {

		List<SentencePair> sentences = invertedCorpusVocabulary.get(word);
		assert sentences != null : "You didn't use the right corpus to invert the vocabulary.";

		int matches = 0;
		for (SentencePair sent : sentences) {

			// make sure this sentence doesn't contain any other candidate words
			if (containsOtherCandidateWord(sent, word, competingWords)) {
				continue;
			}

			if (hasLeafWithOneOfTheseValues(sent.getFeatureStructure(), featureValues)) {
				matches++;
			}
		}
		System.out.println("CORRELATION: " + word + " as " + featureValues + " = " + matches
				+ " / " + sentences.size());
		return (double) matches / (double) sentences.size();
	}

	/**
	 * Returns true if any normalized target token of sent, other than word
	 * itself, is in competingWords.
	 */
	private static boolean containsOtherCandidateWord(SentencePair sent, String word,
			Set<String> competingWords) {

		for (String otherWord : sent.getNormalizedTargetTokens()) {
			// the word being scored necessarily occurs in its own sentences,
			// so it must not disqualify them
			if (!otherWord.equals(word) && competingWords.contains(otherWord)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Returns true if some terminal node of featureStructure has exactly two
	 * values and its second value equals the name of one of featureValues.
	 */
	private static boolean hasLeafWithOneOfTheseValues(SmartTree featureStructure,
			List<FeatureValueSpec> featureValues) {

		List<TreeNode> leaves = featureStructure.getTerminalNodes();
		for (TreeNode leaf : leaves) {
			if (leaf.getValues().size() != 2) {
				continue;
			}
			String leafValue = leaf.getValues().get(1);
			for (FeatureValueSpec spec : featureValues) {
				if (spec.getName().equals(leafValue)) {
					return true;
				}
			}
		}
		return false;
	}

	/**
	 * Returns true if the gold standard lists morpheme for at least one of the
	 * given feature values.
	 */
	private boolean isGoldMorpheme(String morpheme, List<FeatureValueSpec> featureValues) {

		for (FeatureValueSpec spec : featureValues) {
			InductiveResultRow goldRow = gold.getRowForFeatureValue(spec.getName());
			if (goldRow == null) {
				continue;
			}
			for (String goldMorpheme : goldRow.morphemes) {
				if (morpheme.equals(goldMorpheme)) {
					return true;
				}
			}
		}
		return false;
	}

	/**
	 * Registers a value cluster: records every morpheme added by the cluster's
	 * evidence in the morpheme table, groups the evidence into one
	 * ExpressedTheme per distinct morpheme string, scores each theme, prunes
	 * those scoring below the expression threshold and stores the survivors
	 * (sorted by descending weighted frequency) for later retrieval via
	 * {@link #getExpressedThemes(FeatureValueCluster)}. Progress and evidence
	 * are logged to standard output.
	 * 
	 * @param valueCluster
	 *            the cluster to analyze
	 * @throws SegmenterException
	 */
	public void addFeatureValueCluster(FeatureValueCluster valueCluster) throws SegmenterException {

		String featureName = valueCluster.getParentFeatureNames();
		String valueName = valueCluster.getShortName();

		// one theme per distinct morpheme string
		HashMap<String, ExpressedTheme> themes = new HashMap<String, ExpressedTheme>();

		Collection<NodeEvidence> morphemes = valueCluster.getWords().values();
		double sumOfWeightedFrequencies = 0;
		for (NodeEvidence nodeEvidence : morphemes) {

			Collection<ObservedMorpheme> addedMorphemes = nodeEvidence.getAddedMorphemes();
			for (ObservedMorpheme addedMorpheme : addedMorphemes) {

				addObservedMorpheme(featureName, valueName, addedMorpheme);

				for (Paradigm p : addedMorpheme.paradigms) {
					System.out.print("Found paradigm for expressed theme: " + p + " ------- ");
				}

				// The map is keyed on the morpheme string, so the lookup must
				// use the same key as the put below; otherwise every
				// occurrence would replace the theme and discard the
				// frequency and evidence accumulated so far.
				ExpressedTheme theme = themes.get(addedMorpheme.morpheme);
				if (theme == null) {
					theme = new ExpressedTheme(addedMorpheme);
					themes.put(addedMorpheme.morpheme, theme);
				}

				double weight = (double) nodeEvidence.getWeightedFrequency();
				if (minPairsWithMoreChangesAreLessReliable) {
					// this weighting implies that sentences with more changed
					// words between them are less reliable
					weight /= (double) addedMorphemes.size();
				}

				// accumulate frequencies from each NodeEvidence instance
				theme.setWeightedFrequency(theme.getWeightedFrequency() + weight);
				theme.getEvidence().add(nodeEvidence);
				sumOfWeightedFrequencies += weight;
			}
		}

		// calculate probabilities and prune themes below the threshold
		final int nTotalThemes = themes.size();
		List<ExpressedTheme> values = new ArrayList<ExpressedTheme>(nTotalThemes);
		for (ExpressedTheme value : themes.values()) {

			// TODO: Make this morpheme-based instead of word-based
			double percent =
					getPercentOfSentencesWhereThisWordHasOneOfTheseMeanings(
							value.getMorpheme().morpheme, valueCluster.getWords().keySet(),
							valueCluster.getFeatureValues());
			double prob = (double) value.getWeightedFrequency();
			if (normalizeWeightedFrequencies) {
				prob /= sumOfWeightedFrequencies;
			}

			if (scoreThemesWithInvertedVocabulry) {
				value.setProbability(percent);
			} else {
				value.setProbability(prob);
			}

			System.out.print("THEME: " + valueCluster.getShortName() + " ==> "
					+ value.getMorpheme().morpheme + " = "
					+ FormatUtils.formatDouble4(value.getProbability())
					+ " (probNodeEvidenceSupportingThisWord = " + FormatUtils.formatDouble4(prob)
					+ "; percentOfSentencesWhereThisWordHasOneOfTheseMeanings = "
					+ FormatUtils.formatDouble4(percent) + ") ... ");

			if (value.getProbability() < expressionThreshold) {
				System.out.println("Pruned.");
				continue;
			}

			System.out.println("Kept.");
			values.add(value);

			String correct = ">>INCORRECT<<";
			if (isGoldMorpheme(value.getMorpheme().morpheme, valueCluster.getFeatureValues())) {
				correct = "++GOOD++";
			}

			System.out.println("EVIDENCE: " + valueCluster.getShortName() + " ==> "
					+ value.getMorpheme().morpheme + " = "
					+ FormatUtils.formatDouble4(value.getProbability())
					+ " (probNodeEvidenceSupportingThisWord = "
					+ FormatUtils.formatDouble4(prob)
					+ "; percentOfSentencesWhereThisWordHasOneOfTheseMeanings = "
					+ FormatUtils.formatDouble4(percent) + ") ... Evidence Sentences for "
					+ valueName + " for theme (morpheme) = \"" + value.getMorpheme().morpheme
					+ "\", MARKED ON: " + value.getMorpheme().getMarkedOnString()
					+ " -------------- (WEIGHTED) FREQUENCY = " + value.getWeightedFrequency()
					+ " " + correct);

			for (NodeEvidence nodeEvidence : value.getEvidence()) {
				List<ArcEvidenceCluster<FeatureMarking>> arcs =
						nodeEvidence.getArcEvidenceClusters();

				// just print info about the first arc
				ArcEvidenceCluster<FeatureMarking> arc = arcs.get(0);
				System.out.println(arc.getPairA().getId() + ") " + arc.getPairA() + "\n"
						+ arc.getPairB().getId() + ") " + arc.getPairB());
			}
			System.out.println();
		}
		final int nKeptThemes = values.size();
		double retained = (double) nKeptThemes / (double) nTotalThemes * 100;
		System.out.println("THEMES: Kept " + nKeptThemes + "/" + nTotalThemes + " themes ("
				+ FormatUtils.formatDouble2(retained) + "% retention) for value cluster: "
				+ valueCluster);
		this.allKeptThemes += nKeptThemes;
		this.allTotalThemes += nTotalThemes;

		// TODO: For each sentence in evidence, for each feature in sentence,
		// attempt to partition the theme

		// or do we instead want to re-cluster based on morpheme-to-feature
		// mappings
		// and then try to run a decision tree from there?

		// Shall we do partitioning to determine the initial probabilities?
		// If so, what is our baseline model?

		// skip n/a features

		// most frequent themes first
		Collections.sort(values, new Comparator<ExpressedTheme>() {
			public int compare(ExpressedTheme o1, ExpressedTheme o2) {
				return Double.compare(o2.getWeightedFrequency(), o1.getWeightedFrequency());
			}
		});

		this.expressedThemes.put(valueCluster, values);
	}

	/**
	 * @return the percentage (0-100) of themes kept across all clusters added
	 *         so far; NaN if no themes have been seen yet (0 / 0)
	 */
	public double getPercentOfThemesKept() {
		return (double) allKeptThemes / (double) allTotalThemes * 100;
	}
}
