package edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive;

import info.jonclark.util.FormatUtils;
import info.jonclark.util.LatexUtils;
import info.jonclark.util.StringUtils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.TreeSet;

import edu.cmu.cs.lti.avenue.corpus.CorpusException;
import edu.cmu.cs.lti.avenue.corpus.SentencePair;
import edu.cmu.cs.lti.avenue.trees.smart.TreeNode;

/**
 * A group of sentence pairs gathered around a single prototype sentence pair.
 * <p>
 * The prototype is the pair handed to the constructor. The cluster vocabulary
 * is the set of normalized (irregular forms replaced, suffixes stripped)
 * non-stopword source tokens of that prototype; pairs added later through
 * {@link #addSentence(SentencePair)} do not extend the vocabulary.
 * <p>
 * A candidate pair is compared against the prototype by
 * {@link #getScore(SentencePair)} and accepted or rejected by
 * {@link #isLike(SentencePair)}.
 */
public class LexicalCluster {

	private final ArrayList<SentencePair> pairs = new ArrayList<SentencePair>();
	private final HashSet<String> vocabulary = new HashSet<String>();
	private final SentencePair prototype;

	/**
	 * Boolean "true" encoded as a double so that boolean features can take
	 * part in the weighted sum computed by {@link #getScore(SentencePair)}.
	 */
	public static final double TRUE = 1.0;
	/** Boolean "false" encoded as a double; see {@link #TRUE}. */
	public static final double FALSE = 0.0;
	/** Initial capacity of the buffer used by {@link #toLatexString()}. */
	private static final int LARGE_SIZE = 100000;

	/**
	 * Pairs for which the "blank head" diagnostic has already been printed;
	 * shared by all clusters so each pair is reported at most once.
	 */
	private static final HashSet<SentencePair> errors = new HashSet<SentencePair>();

	/**
	 * Plain value holder used in two roles: as the scores computed for one
	 * query pair, and as the thresholds/weights a cluster is configured with.
	 * All ratio fields are fractions (they are not multiplied by 100).
	 */
	public static class LexClusterScores {
		/**
		 * Only read from the thresholds object: selects per-feature threshold
		 * comparison instead of the interpolated score in
		 * {@link LexicalCluster#isLike(SentencePair)}.
		 */
		public boolean useHardThresholds;
		/** Number of normalized query tokens found in the cluster vocabulary. */
		public int vocabHits;
		/** Neither read nor written by {@link LexicalCluster}. */
		public int stemmedQuerySize;
		/** {@link #vocabHits} divided by the cluster vocabulary size. */
		public double vocabHitPercent;
		/** {@link #vocabHits} divided by the number of normalized query tokens. */
		public double stemmedVocabHitPercent;
		/** Cluster vocabulary size relative to the number of normalized query tokens. */
		public double stemmingAttritionRate;
		/** Prototype token count relative to the query token count. */
		public double lengthRatio;
		/** {@link LexicalCluster#TRUE} if query and prototype share a normalized head. */
		public double hasSameStemmedHead;
		/**
		 * {@link LexicalCluster#TRUE} if the text before the first ':' of the
		 * feature source lines of query and prototype is equal.
		 */
		public double cameFromSameFStructFile;
		/** Sum of the individual scores, each weighted by the matching threshold field. */
		public double interpolatedScore;
	}

	private final LexClusterScores thresholds;

	// TODO: Take these elements out as their own configuration class
	private final HashSet<String> stopwords;
	/** Suffixes stripped, in this order, on every pass of {@link #removeSuffixes(String)}. */
	private static final String[] SUFFIXES =
			{ "ies", "es", "s", "e", "y", "ed", "ning", "ing", "n't" };
	/**
	 * Whole-word replacements applied before suffix stripping. Because the
	 * suffixes are stripped afterwards, several values are already written in
	 * their stripped form (e.g. "leav", "mak").
	 */
	private static final HashMap<String, String> IRREGULAR_MAP = new HashMap<String, String>();
	static {
		// irregular verb forms
		IRREGULAR_MAP.put("said", "say");
		IRREGULAR_MAP.put("ate", "eat");
		IRREGULAR_MAP.put("left", "leav");
		IRREGULAR_MAP.put("made", "mak");
		// The map is consulted before suffixes are stripped, so the surface
		// form "broke" has to be listed itself; the "brok" key alone is only
		// hit by a token that is literally "brok".
		IRREGULAR_MAP.put("broke", "break");
		IRREGULAR_MAP.put("brok", "break");
		IRREGULAR_MAP.put("sang", "sing");
		IRREGULAR_MAP.put("sung", "sing");
		IRREGULAR_MAP.put("drank", "drink");
		IRREGULAR_MAP.put("wrote", "write");
		IRREGULAR_MAP.put("gave", "give");
		IRREGULAR_MAP.put("letting", "let");
		IRREGULAR_MAP.put("won't", "willn't");

		// gender-specific words
		// NOTE(review): "woman" has no entry although "women" maps to "man" -- confirm intent.
		IRREGULAR_MAP.put("policemen", "policeman");
		IRREGULAR_MAP.put("policewoman", "policeman");
		IRREGULAR_MAP.put("policewomen", "policeman");

		IRREGULAR_MAP.put("women", "man");
		IRREGULAR_MAP.put("men", "man");
		IRREGULAR_MAP.put("girl", "boy");
		IRREGULAR_MAP.put("sister", "brother");

		// also include pronouns here
		IRREGULAR_MAP.put("he", "pro");
		IRREGULAR_MAP.put("she", "pro");
		IRREGULAR_MAP.put("it", "pro");
		IRREGULAR_MAP.put("they", "pro");
		IRREGULAR_MAP.put("i", "pro");
		IRREGULAR_MAP.put("we", "pro");
		IRREGULAR_MAP.put("you", "pro");

		// accusative pronouns
		IRREGULAR_MAP.put("them", "pro");
		IRREGULAR_MAP.put("us", "pro");
		IRREGULAR_MAP.put("her", "pro");
		IRREGULAR_MAP.put("him", "pro");

		// forms of be
		IRREGULAR_MAP.put("am", "be");
		IRREGULAR_MAP.put("is", "be");
		IRREGULAR_MAP.put("are", "be");
		IRREGULAR_MAP.put("was", "be");
		IRREGULAR_MAP.put("were", "be");
		IRREGULAR_MAP.put("being", "be");
		IRREGULAR_MAP.put("been", "be");
	}

	/**
	 * Creates a cluster whose prototype and first member is {@code firstPair}.
	 *
	 * @param thresholds
	 *            thresholds (hard mode) or weights plus cut-off (interpolated
	 *            mode) used when scoring candidate pairs
	 * @param stopwords
	 *            words excluded from normalization output and from the
	 *            cluster vocabulary
	 * @param firstPair
	 *            the prototype; its normalized source tokens form the
	 *            vocabulary
	 */
	public LexicalCluster(LexClusterScores thresholds, HashSet<String> stopwords,
			SentencePair firstPair) {
		this.thresholds = thresholds;
		this.stopwords = stopwords;

		pairs.add(firstPair);
		prototype = firstPair;

		// normalize() drops stopwords before stemming; this second check also
		// drops words that only become stopwords after stemming.
		for (final String stemmedWord : normalize(firstPair.getNormalizedSourceTokens())) {
			if (!stopwords.contains(stemmedWord)) {
				vocabulary.add(stemmedWord);
			}
		}
	}

	/**
	 * Adds a pair to this cluster. The prototype and the vocabulary are not
	 * affected.
	 *
	 * @param pair
	 *            the pair to add
	 */
	public void addSentence(SentencePair pair) {
		pairs.add(pair);
	}

	/**
	 * Returns the normalized yield of the ultimate head of the pair's source
	 * constituent structure. If normalization leaves nothing (the head is a
	 * stopword), falls back to the head with only irregular forms replaced
	 * and prints a one-time diagnostic unless that fallback is "be".
	 *
	 * @param pair
	 *            the pair whose head is wanted
	 * @return the normalized head, or the backoff form described above
	 * @throws CorpusException
	 *             propagated from the calls made on the pair
	 */
	private String getHead(SentencePair pair) throws CorpusException {

		TreeNode head =
				pair.getPhiPlusMapping().getUltimateHead(pair,
						pair.getSourceConstituentStructure().getRootNode());

		String strHead = normalize(head.getYield());
		if (!strHead.equals("")) {
			return strHead;
		}

		String strBackoffHead = replaceIrregularAndGenderInflectedForms(head.getYield());
		if (!errors.contains(pair) && !strBackoffHead.equals("be")) {
			System.out.println("LEXICAL CLUSTERING: blank head after normalization: "
					+ pair.getId() + " " + pair.toString() + " -- BACKOFF: " + strBackoffHead);
			errors.add(pair);
		}
		return strBackoffHead;
	}

	/**
	 * Scores a candidate pair against this cluster's prototype and
	 * vocabulary.
	 * <p>
	 * The ratios are plain floating-point divisions: if the vocabulary or the
	 * normalized query is empty the affected fields (and therefore the
	 * interpolated score) become NaN or infinite rather than raising an
	 * error.
	 *
	 * @param pair
	 *            the candidate pair
	 * @return a freshly allocated score object; {@code useHardThresholds} and
	 *         {@code stemmedQuerySize} are left at their defaults
	 * @throws CorpusException
	 *             propagated from head extraction
	 */
	protected LexClusterScores getScore(SentencePair pair) throws CorpusException {

		ArrayList<String> stemmed = normalize(pair.getNormalizedSourceTokens());
		int hits = 0;
		for (final String stemmedWord : stemmed) {
			if (vocabulary.contains(stemmedWord)) {
				hits++;
			}
		}

		String strQueryHead = getHead(pair);
		String strPrototypeHead = getHead(prototype);

		// match on source file name (the part before the first ':'); when
		// either line is missing both names stay empty and thus compare equal
		String strSourceFile1 = "";
		String strSourceFile2 = "";

		if (pair.getFeatureSourceLine() != null && prototype.getFeatureSourceLine() != null) {
			strSourceFile1 = StringUtils.substringBefore(pair.getFeatureSourceLine(), ":");
			strSourceFile2 = StringUtils.substringBefore(prototype.getFeatureSourceLine(), ":");
		} else {
			System.err.println("LEXICAL CLUSTERING: NO FEATURE SOURCE LINE FOR " + pair.getMyLine());
		}

		LexClusterScores score = new LexClusterScores();
		score.vocabHits = hits;

		score.vocabHitPercent = (double) hits / (double) vocabulary.size();
		score.stemmedVocabHitPercent = (double) hits / (double) stemmed.size();
		score.stemmingAttritionRate = 1 / ((double) stemmed.size() / (double) vocabulary.size());
		score.lengthRatio =
				1 / ((double) pair.getNormalizedSourceTokens().length / (double) prototype.getNormalizedSourceTokens().length);
		score.hasSameStemmedHead = strQueryHead.equals(strPrototypeHead) ? TRUE : FALSE;
		score.cameFromSameFStructFile = strSourceFile1.equals(strSourceFile2) ? TRUE : FALSE;

		// in interpolated mode the threshold fields act as feature weights
		score.interpolatedScore =
				thresholds.vocabHitPercent * score.vocabHitPercent
						+ thresholds.stemmedVocabHitPercent * score.stemmedVocabHitPercent
						+ thresholds.stemmingAttritionRate * score.stemmingAttritionRate
						+ thresholds.lengthRatio * score.lengthRatio
						+ thresholds.hasSameStemmedHead * score.hasSameStemmedHead
						+ thresholds.cameFromSameFStructFile * score.cameFromSameFStructFile;

		return score;
	}

	/**
	 * Decides whether a candidate pair is similar enough to this cluster.
	 * <p>
	 * A threshold value of {@link #TRUE} for {@code hasSameStemmedHead} or
	 * {@code cameFromSameFStructFile} makes that feature mandatory. After
	 * that, either every numeric score must reach its own threshold (hard
	 * mode) or the interpolated score must reach the interpolated threshold.
	 *
	 * @param pair
	 *            the candidate pair
	 * @return {@code true} if the pair passes the configured thresholds
	 * @throws CorpusException
	 *             propagated from {@link #getScore(SentencePair)}
	 */
	public boolean isLike(SentencePair pair) throws CorpusException {
		LexClusterScores score = getScore(pair);
		if (thresholds.hasSameStemmedHead == TRUE && score.hasSameStemmedHead == FALSE) {
			return false;
		}
		if (thresholds.cameFromSameFStructFile == TRUE && score.cameFromSameFStructFile == FALSE) {
			return false;
		}

		if (!thresholds.useHardThresholds) {
			return score.interpolatedScore >= thresholds.interpolatedScore;
		}

		// each score is compared against its own threshold
		return score.vocabHits >= thresholds.vocabHits
				&& score.vocabHitPercent >= thresholds.vocabHitPercent
				&& score.stemmedVocabHitPercent >= thresholds.stemmedVocabHitPercent
				&& score.stemmingAttritionRate >= thresholds.stemmingAttritionRate
				&& score.lengthRatio >= thresholds.lengthRatio;
	}

	/**
	 * Normalizes a single word.
	 *
	 * @param word
	 *            the word to normalize
	 * @return the normalized word, or the empty string if the word is a
	 *         stopword
	 */
	public String normalize(String word) {
		List<String> normalized = normalize(new String[] { word });
		return normalized.isEmpty() ? "" : normalized.get(0);
	}

	/**
	 * Normalizes a tokenized sentence: stopwords are dropped, then each
	 * remaining word has irregular forms replaced until stable, followed by
	 * suffix stripping until stable.
	 *
	 * @param sentence
	 *            the tokens to normalize; not modified
	 * @return a new list with one entry per non-stopword token, in order
	 */
	public ArrayList<String> normalize(String[] sentence) {

		ArrayList<String> stemmed = new ArrayList<String>(sentence.length);
		for (final String word : sentence) {
			if (!stopwords.contains(word)) {
				stemmed.add(word);
			}
		}

		for (int i = 0; i < stemmed.size(); i++) {
			String current = stemmed.get(i);

			// follow chains in the irregular map until nothing changes
			String next = replaceIrregularAndGenderInflectedForms(current);
			while (!next.equals(current)) {
				current = next;
				next = replaceIrregularAndGenderInflectedForms(current);
			}

			// strip suffixes repeatedly: removing one may expose another
			next = removeSuffixes(current);
			while (!next.equals(current)) {
				current = next;
				next = removeSuffixes(current);
			}

			stemmed.set(i, current);
		}

		return stemmed;
	}

	/**
	 * Runs one pass over {@link #SUFFIXES}, handing the word to
	 * {@code StringUtils.removeTrailingString} once per suffix in order.
	 *
	 * @param word
	 *            the word to strip
	 * @return the word after one pass
	 */
	private String removeSuffixes(String word) {
		String stripped = word;
		for (final String suffix : SUFFIXES) {
			stripped = StringUtils.removeTrailingString(stripped, suffix);
		}
		return stripped;
	}

	/**
	 * Looks the word up in {@link #IRREGULAR_MAP} (irregular verbs,
	 * gender-specific nouns, pronouns and forms of "be").
	 *
	 * @param word
	 *            the word to look up
	 * @return the mapped form, or {@code word} itself if there is no entry
	 */
	private String replaceIrregularAndGenderInflectedForms(String word) {
		String regularForm = IRREGULAR_MAP.get(word);
		// no entry: the word is returned unchanged
		return regularForm != null ? regularForm : word;
	}

	/**
	 * @return the cluster vocabulary in sorted order, joined with "-"
	 */
	@Override
	public String toString() {
		TreeSet<String> sorted = new TreeSet<String>(vocabulary);
		return StringUtils.untokenize(sorted, "-");
	}

	/**
	 * @return the live internal list of pairs in this cluster (not a copy),
	 *         prototype first
	 */
	public ArrayList<SentencePair> getSentencePairs() {
		return pairs;
	}

	/** Passes the string through {@code LatexUtils.replaceLatexKillers}. */
	private static String clean(String str) {
		return LatexUtils.replaceLatexKillers(str);
	}

	/**
	 * Renders one LaTeX line per member pair: the sentence followed by its
	 * scores in bold, as
	 * {@code vocabHit% / stemmedVocabHit% / attrition% / lengthRatio% / sameFile}.
	 *
	 * @return the concatenated lines, each terminated by a LaTeX line break
	 * @throws CorpusException
	 *             propagated from {@link #getScore(SentencePair)}
	 */
	public String toLatexString() throws CorpusException {
		StringBuilder sentencesBuilder = new StringBuilder(LARGE_SIZE);
		for (final SentencePair sentence : getSentencePairs()) {
			LexClusterScores score = getScore(sentence);
			// every column is a fraction scaled to a percentage, so the first
			// one uses vocabHitPercent rather than the raw hit count
			String strScore =
					"\\textbf{"
							+ clean(FormatUtils.formatDouble2(score.vocabHitPercent * 100) + " / "
									+ FormatUtils.formatDouble2(score.stemmedVocabHitPercent * 100)
									+ " / "
									+ FormatUtils.formatDouble2(score.stemmingAttritionRate * 100)
									+ " / " + FormatUtils.formatDouble2(score.lengthRatio * 100)
									+ " / " + score.cameFromSameFStructFile) + "}";
			sentencesBuilder.append(clean(sentence.toString())).append(" ").append(strScore)
					.append(" \\\\ \n");
		}
		return sentencesBuilder.toString();
	}
}
