package edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive;

import info.jonclark.lang.Pair;
import info.jonclark.properties.PropertyUtils;
import info.jonclark.properties.SmartProperties;
import info.jonclark.stat.RemainingTimeEstimator;
import info.jonclark.stat.SecondTimer;
import info.jonclark.stat.TaskListener;
import info.jonclark.stat.TextProgressBar;
import info.jonclark.util.FileUtils;
import info.jonclark.util.FormatUtils;
import info.jonclark.util.HashUtils;
import info.jonclark.util.MemoryMonitor;
import info.jonclark.util.StringUtils;
import info.jonclark.util.TransformException;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Properties;

import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.SAXException;

import edu.cmu.cs.lti.avenue.corpus.Corpus;
import edu.cmu.cs.lti.avenue.corpus.CorpusException;
import edu.cmu.cs.lti.avenue.corpus.SentencePair;
import edu.cmu.cs.lti.avenue.corpus.Serializer;
import edu.cmu.cs.lti.avenue.featurespecification.FeatureStructureManager;
import edu.cmu.cs.lti.avenue.morphanalyzer.MorphologicalAnalyzer;
import edu.cmu.cs.lti.avenue.morphology.ParaMor;
import edu.cmu.cs.lti.avenue.morphology.Segmenter;
import edu.cmu.cs.lti.avenue.morphology.SegmenterException;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.LexicalCluster.LexClusterScores;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.evidence.FeatureMarking;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.matrices.TriangularMatrixEvidenceCluster;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.reports.CsvFeatureDetectionReport;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.reports.FeatureDetectionReport;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.reports.LatexFeatureDetectionReport;
import edu.cmu.cs.lti.avenue.navigation.search.generation1.tables.ConfigurationException;
import edu.cmu.cs.lti.avenue.navigation.tools.UtfUtils;

/**
 * The main class for inductive feature detection, which takes in an elicitation
 * corpus and outputs the results in CSV format, which can then be evaluated in
 * various ways. This class can also write out data for a MorphologicalAnalyzer
 * if a ParaMor segmentation model is given as input.
 * 
 * @author jon
 */
public class RulelessFeatureDetector {

	/**
	 * Command-line entry point. Reads the properties file named by the single
	 * argument, loads the feature specification and the elicitation corpus,
	 * maps sentences to minimal pairs, builds the feature expression graphs,
	 * runs inference and writes the report, prints a few summary counters and
	 * finally writes the morphological analyzer tagging data.
	 * 
	 * @param args
	 *            exactly one element: the path of the properties file;
	 *            otherwise a usage message is printed and the JVM exits with
	 *            status 1
	 * @throws Exception
	 *             any failure raised while configuring, loading, inferring or
	 *             writing is propagated unchanged
	 */
	public static void main(String[] args) throws Exception {

		if (args.length != 1) {
			System.err.println("Usage: program <properties_file>");
			System.exit(1);
		}

		// read configuration
		Properties props = PropertyUtils.getProperties(new File(args[0]));
		SmartProperties smartProps = new SmartProperties(props);
		String sourceLanguageName = smartProps.getPropertyString("global.sourceLanguage");
		String targetLanguageName = smartProps.getPropertyString("global.targetLanguage");
		String encoding = smartProps.getPropertyString("global.encoding");

		// width handed to every text progress bar created below
		int screenWidth = smartProps.getPropertyInt("progressBar.screenWidth");

		// settings that control inference over the feature expression graphs
		int nInteractions = smartProps.getPropertyInt("inference.numFeatureInteractions");
		double inferenceExpressionThreshold =
				smartProps.getPropertyFloat("inference.expressionThreshold");
		boolean considerFeatureContext =
				smartProps.getPropertyBoolean("inference.considerFeatureContext");
		boolean useExactMinimalPairs =
				smartProps.getPropertyBoolean("inference.useExactMinimalPairs");

		double themeExpressionThreshold = smartProps.getPropertyFloat("themes.expressionThreshold");
		File xmlFeatureSpec = smartProps.getPropertyFile("paths.featureSpec");
		File elicitedCorpusFile = smartProps.getPropertyFile("paths.mergedCorpus");

		// feature names to include / exclude; turned into sets further down
		String[] featureWhitelist = smartProps.getPropertyStringArray("inference.featureWhitelist");
		String[] featureBlacklist = smartProps.getPropertyStringArray("inference.featureBlacklist");

		// flags passed straight through to the ExpressedThemeManager
		boolean minPairsWithMoreChangesAreLessReliable =
				smartProps.getPropertyBoolean("inference.minPairsWithMoreChangesAreLessReliable");
		boolean scoreThemesWithInvertedVocabulary =
				smartProps.getPropertyBoolean("inference.scoreThemesWithInvertedVocabulary");
		boolean normalizeWeightedFrequencies =
				smartProps.getPropertyBoolean("inference.normalizeWeightedFrequencies");

		LexClusterScores lexicalThresholds = readLexicalThresholds(smartProps);

		// FeatureMarking is configured through a static init call, so this must
		// happen before any component that relies on it is used
		boolean allowMarkingsOnMe =
				smartProps.getPropertyBoolean("inference.markings.allowMarkingsOnMe");
		boolean allowMarkingsOnMyDependends =
				smartProps.getPropertyBoolean("inference.markings.allowMarkingsOnMyDependends");
		boolean allowMarkingsOnMyGovernor =
				smartProps.getPropertyBoolean("inference.markings.allowMarkingsOnMyGovernor");
		boolean allowMarkingsOnOthers =
				smartProps.getPropertyBoolean("inference.markings.allowMarkingsOnOthers");
		FeatureMarking.init(allowMarkingsOnMe, allowMarkingsOnMyDependends,
				allowMarkingsOnMyGovernor, allowMarkingsOnOthers);

		HashSet<String> stopwords = initStopwords(smartProps);
		HashSet<String> excludedFeatures = new HashSet<String>(Arrays.asList(featureBlacklist));
		HashSet<String> includedFeatures = new HashSet<String>(Arrays.asList(featureWhitelist));

		// initialize components ---------------------------------

		FeatureStructureManager fsMan = loadFeatureSpec(xmlFeatureSpec);

		// load data
		Corpus elicitationCorpus = loadCorpus(encoding, elicitedCorpusFile);

		// per-token counts and a token -> sentences index over the target side
		HashMap<String, Integer> corpusCounts = analyzeFrequencies(elicitationCorpus);
		HashMap<String, ArrayList<SentencePair>> invertedCorpusVocabulary =
				invertCorpusVocabulary(elicitationCorpus);

		InductiveResultsEvaluator gold = new InductiveResultsEvaluator(smartProps);

		ExpressedThemeManager themeMan =
				new ExpressedThemeManager(corpusCounts, invertedCorpusVocabulary,
						themeExpressionThreshold, minPairsWithMoreChangesAreLessReliable,
						scoreThemesWithInvertedVocabulary, normalizeWeightedFrequencies, gold);

		// null when inference.doSegmentation is false
		Segmenter segmenter = loadSegmenter(props);

		// the two distances parameterize the PlateauFunction used as distance penalty
		int noPenaltyDistance = smartProps.getPropertyInt("inference.noPenaltyDistance");
		int somePenaltyDistance = smartProps.getPropertyInt("inference.somePenaltyDistance");

		LexicalClusterManager lexMan = new LexicalClusterManager(lexicalThresholds, stopwords);
		PlateauFunction distancePenalty =
				new PlateauFunction(noPenaltyDistance, somePenaltyDistance);

		FeatureExpressionGraphManager fegMan =
				new FeatureExpressionGraphManager(inferenceExpressionThreshold, fsMan, segmenter,
						distancePenalty, lexMan, includedFeatures, excludedFeatures, nInteractions,
						considerFeatureContext);

		MinimalPairManager minMan =
				new MinimalPairManager(includedFeatures, excludedFeatures, fsMan,
						useExactMinimalPairs);

		// process ALL the sentences (this can be tweaked for debugging)
		// NOTE(review): the corpus size assigned on the next line is
		// immediately overwritten by the configured cutoff, so the property
		// value always wins and the first assignment is a dead store.
		int CUTOFF = elicitationCorpus.getSentences().size();
		CUTOFF = smartProps.getPropertyInt("inference.sentenceCountCutoff");

		// give us some nice visual feedback
		// this can be disabled when running at the console
		// NOTE(review): est is handed to buildMinPairs and buildFEGs, but
		// neither helper reads it.
		RemainingTimeEstimator est = new RemainingTimeEstimator(100);

		// map sentences to which minimal pairs they will be compatible with
		ArrayList<Pair<SentencePair, ArrayList<MinimalPairMapping>>> mappings =
				new ArrayList<Pair<SentencePair, ArrayList<MinimalPairMapping>>>();
		// buildMinPairs receives the running total (0 here) and returns it updated
		int nMinPairs = 0;
		nMinPairs =
				buildMinPairs(nInteractions, elicitationCorpus, minMan, CUTOFF, est, mappings,
						nMinPairs, screenWidth);

		// construct Feature Expression Graph skeletons, but don't do any
		// inference yet
		buildFEGs(fegMan, est, mappings, nMinPairs, screenWidth);

		// do internal analysis on each internal FEG
		// (if you want to parallelize, this is the place to do it since each
		// FEG is now independent)
		doInferenceAndWriteResults(smartProps, sourceLanguageName, targetLanguageName,
				excludedFeatures, includedFeatures, fsMan, themeMan, fegMan, screenWidth);

		// summary statistics, read from the managers and from static counters
		System.out.println("LEXICAL CLUSTERING: CLUSTER COUNT = " + lexMan.getNumClusters());
		System.out.println("THEMES: PERCENT KEPT: "
				+ FormatUtils.formatDouble2(themeMan.getPercentOfThemesKept()) + "%");

		System.out.println("ENTRIES: " + TriangularMatrixEvidenceCluster.nEntries);
		System.out.println("SIM: " + FeatureExpressionGraph.nSim);
		System.out.println("DIF: " + FeatureExpressionGraph.nDif);

		// NOTE(review): this runs unconditionally, so segmenter may be null
		// here; confirm that makeMorphologicalAnalyzer tolerates a null
		// segmenter.
		writeMorphAnalyzer(smartProps, themeMan, segmenter);
	}

	/**
	 * Populates a fresh {@link LexClusterScores} from the
	 * <code>inference.lexicalThresholds.*</code> properties.
	 * 
	 * @param smartProps
	 *            the configuration to read the thresholds from
	 * @return a new score object holding the configured threshold values
	 */
	public static LexClusterScores readLexicalThresholds(SmartProperties smartProps) {
		final LexClusterScores scores = new LexClusterScores();

		// thresholds are "hard" only when the mode property says so
		// (compared case-insensitively)
		final String mode = smartProps.getPropertyString("inference.lexicalThresholds.mode");
		scores.useHardThresholds = mode.equalsIgnoreCase("hard");

		// integer-valued thresholds
		scores.vocabHits = smartProps.getPropertyInt("inference.lexicalThresholds.vocabHits");
		scores.stemmedQuerySize =
				smartProps.getPropertyInt("inference.lexicalThresholds.stemmedQuerySize");

		// float-valued thresholds
		scores.lengthRatio = smartProps.getPropertyFloat("inference.lexicalThresholds.lengthRatio");
		scores.vocabHitPercent =
				smartProps.getPropertyFloat("inference.lexicalThresholds.vocabHitPercent");
		scores.stemmedVocabHitPercent =
				smartProps.getPropertyFloat("inference.lexicalThresholds.stemmedVocabHitPercent");
		scores.cameFromSameFStructFile =
				smartProps.getPropertyFloat("inference.lexicalThresholds.cameFromSameFStructFile");
		scores.hasSameStemmedHead =
				smartProps.getPropertyFloat("inference.lexicalThresholds.hasSameStemmedHead");
		scores.interpolatedScore =
				smartProps.getPropertyFloat("inference.lexicalThresholds.interpolatedScore");

		return scores;
	}

	/**
	 * Asks the theme manager for a morphological analyzer and saves its tagger
	 * data to the file configured as <code>morphAnalyzer.tagSaveFile</code>,
	 * logging progress to standard output.
	 * 
	 * @param smartProps
	 *            configuration supplying the output file
	 * @param themeMan
	 *            the theme manager that builds the analyzer
	 * @param segmenter
	 *            the segmenter handed to the theme manager
	 */
	private static void writeMorphAnalyzer(SmartProperties smartProps,
			ExpressedThemeManager themeMan, Segmenter segmenter) throws FileNotFoundException,
			IOException, SegmenterException {

		System.out.println("Writing morphological analyzer...");

		final File outputFile = smartProps.getPropertyFile("morphAnalyzer.tagSaveFile");
		final MorphologicalAnalyzer analyzer = themeMan.makeMorphologicalAnalyzer(segmenter);
		analyzer.saveTaggerData(outputFile);

		final String savedTo = outputFile.getAbsolutePath();
		System.out.println("Wrote morphological analyzer tagging data to : " + savedTo);
	}

	/**
	 * Builds an index from each normalized target-language token to the
	 * sentence pairs it occurs in. One entry is appended per token occurrence,
	 * in corpus order.
	 * 
	 * @param elicitationCorpus
	 *            the corpus whose sentences are indexed
	 * @return a new map from token to the sentence pairs appended for it
	 */
	public static HashMap<String, ArrayList<SentencePair>> invertCorpusVocabulary(
			Corpus elicitationCorpus) {

		final HashMap<String, ArrayList<SentencePair>> sentencesByToken =
				new HashMap<String, ArrayList<SentencePair>>();

		for (final SentencePair pair : elicitationCorpus.getSentences()) {
			final String[] tokens = pair.getNormalizedTargetTokens();
			for (int t = 0; t < tokens.length; t++) {
				HashUtils.append(sentencesByToken, tokens[t], pair);
			}
		}

		return sentencesByToken;
	}

	/**
	 * Runs the inference calculation on every feature expression graph (FEG)
	 * held by <code>fegMan</code> and adds each finished graph to a report,
	 * which is closed at the end. The report is a LaTeX report when
	 * <code>inference.doLatexOutput</code> is true and a CSV report otherwise.
	 * 
	 * Progress is logged to standard output as <code>k/N: name</code>, where
	 * <code>k</code> is the 1-based position of the graph being processed.
	 * 
	 * @param smartProps
	 *            configuration supplying the reporting options
	 * @param sourceLanguageName
	 *            passed to the LaTeX report
	 * @param targetLanguageName
	 *            passed to the LaTeX report
	 * @param excludedFeatures
	 *            passed to the report
	 * @param includedFeatures
	 *            passed to the report
	 * @param fsMan
	 *            passed to the LaTeX report
	 * @param themeMan
	 *            passed to the report
	 * @param fegMan
	 *            supplies the graphs to calculate
	 * @param screenWidth
	 *            width of the text progress bar
	 * @throws OutOfMemoryError
	 *             rethrown after the static evidence counters have been
	 *             printed for diagnosis
	 */
	private static void doInferenceAndWriteResults(SmartProperties smartProps,
			String sourceLanguageName, String targetLanguageName, HashSet<String> excludedFeatures,
			HashSet<String> includedFeatures, FeatureStructureManager fsMan,
			ExpressedThemeManager themeMan, FeatureExpressionGraphManager fegMan, int screenWidth)
			throws IOException, ParseException, TransformException, SegmenterException,
			OutOfMemoryError, FileNotFoundException, InterruptedException, CorpusException {

		// reporting options
		boolean doLatexOutput = smartProps.getPropertyBoolean("inference.doLatexOutput");
		File textResultsFile = smartProps.getPropertyFile("inference.textResultsFile");
		File latexTemplate = smartProps.getPropertyFile("inference.latexTemplate");
		String latexOutputDir = smartProps.getPropertyString("inference.latexOutputDir");
		String latexOutputPrefix = smartProps.getPropertyString("inference.latexOutputPrefix");
		int nMaxDiffEvidence = smartProps.getPropertyInt("inference.maxReportedDifferences");
		int nMaxSimEvidence = smartProps.getPropertyInt("inference.maxReportedSimilarities");
		int nMaxDiffCEvidence =
				smartProps.getPropertyInt("inference.maxReportedCounterDifferences");
		int nMaxSimCEvidence =
				smartProps.getPropertyInt("inference.maxReportedCounterSimilarities");
		boolean reportEmptyCategories =
				smartProps.getPropertyBoolean("inference.reportEmptyCategories");

		// output expressions
		ArrayList<FeatureExpressionGraph> fegs = fegMan.getFeatureExpressionGraphs();

		FeatureDetectionReport report;
		if (doLatexOutput) {
			report =
					new LatexFeatureDetectionReport(latexTemplate, sourceLanguageName,
							targetLanguageName, fsMan, fegMan, themeMan, latexOutputDir,
							latexOutputPrefix, nMaxDiffEvidence, nMaxSimEvidence,
							nMaxDiffCEvidence, nMaxSimCEvidence, reportEmptyCategories,
							excludedFeatures, includedFeatures);
		} else {
			report =
					new CsvFeatureDetectionReport(textResultsFile, reportEmptyCategories, themeMan,
							excludedFeatures, includedFeatures);
		}

		TaskListener bar = new TextProgressBar(System.err, "FEG", 100, screenWidth);
		bar.beginTask(fegs.size());

		// Only step 8 is executed; the loop form is kept so that the range of
		// steps can be widened when debugging.
		for (int step = 8; step <= 8; step++) {

			RemainingTimeEstimator calcEst = new RemainingTimeEstimator(100);
			MemoryMonitor calcMonitor = new MemoryMonitor();
			SecondTimer calcTimer = new SecondTimer(true, true);

			System.out.println("Calculating step " + step + " of expression for " + fegs.size()
					+ " FEG's...");

			// 1-based position of the graph currently being processed
			int j = 0;

			for (final FeatureExpressionGraph feg : fegs) {

				// TODO: We want to cluster on feature context, but not create a
				// separate FEG for each context... yet... and when we do it
				// should be based on contexts relevant to each specific feature

				// advance the counter so that the log shows real progress
				// instead of a constant "0/N"
				j++;
				System.out.println(j + "/" + fegs.size() + ": "
						+ feg.getFeatureInteractions().getName());

				try {
					feg.calculate(step);

					if (step == 8) {
						report.addFEG(feg);
						// calcTimer is shared by all graphs of this step, so the
						// time printed here is cumulative
						System.out.println("Calculated step 8 of FEG "
								+ feg.getFeatureInteractions().getName() + " in "
								+ calcTimer.getSecondsFormatted() + " seconds using "
								+ calcMonitor.getUsageDelta());
						// the graph has been reported; let it drop its data
						// before the next one is calculated
						feg.releaseStrangleHoldOnMemory();
						System.gc();
						System.out.println("Freed " + calcMonitor.getUsageDelta());
					}
				} catch (OutOfMemoryError e) {
					// dump the evidence counters to help diagnose what blew up
					// the heap, then let the error propagate
					System.out.println("ENTRIES: " + TriangularMatrixEvidenceCluster.nEntries);
					System.out.println("SIM: " + FeatureExpressionGraph.nSim);
					System.out.println("DIF: " + FeatureExpressionGraph.nDif);
					throw e;
				}
				bar.recordEventCompletion();
				calcEst.recordEvent();
			} // end for fegs

			calcTimer.pause();
			System.out.println("Calculated step " + step + " of expressions in "
					+ calcTimer.getSecondsFormatted() + " seconds using "
					+ calcMonitor.getUsageDelta());
		}
		bar.endTask();
		report.close();
	}

	/**
	 * Feeds every (sentence, minimal pair mappings) entry to the feature
	 * expression graph manager via <code>observe</code>, showing a progress
	 * bar on standard error and logging elapsed time and memory usage on
	 * standard output.
	 * 
	 * @param fegMan
	 *            the manager that observes each sentence with its mappings
	 * @param est
	 *            not used by this method
	 * @param mappings
	 *            the sentences paired with their minimal pair mappings
	 * @param nMinPairs
	 *            total number of minimal pairs; used only in the log message
	 * @param screenWidth
	 *            width of the text progress bar
	 */
	private static void buildFEGs(FeatureExpressionGraphManager fegMan, RemainingTimeEstimator est,
			ArrayList<Pair<SentencePair, ArrayList<MinimalPairMapping>>> mappings, int nMinPairs,
			int screenWidth) throws CorpusException {

		final MemoryMonitor memory = new MemoryMonitor();
		final SecondTimer stopwatch = new SecondTimer(true, true);

		System.out.println("Building feature expression graphs for " + nMinPairs
				+ " minimal pairs...");

		final TaskListener progress = new TextProgressBar(System.err, "minPair", 10000, screenWidth);
		final int total = mappings.size();
		progress.beginTask(total);

		for (int k = 0; k < total; k++) {
			final Pair<SentencePair, ArrayList<MinimalPairMapping>> entry = mappings.get(k);
			fegMan.observe(entry.first, entry.second);
			progress.recordEventCompletion();
		}

		progress.endTask();

		System.out.println("Built feature expression graphs in " + stopwatch.getSecondsFormatted()
				+ " seconds using " + memory.getUsageDelta());
	}

	/**
	 * Maps the leading sentences of the corpus to the minimal pairs they are
	 * compatible with, for every interaction order from 1 to
	 * <code>nInteractions</code>, and appends one (sentence, mappings) entry
	 * per processed sentence to <code>mappings</code>.
	 * 
	 * At most <code>CUTOFF</code> sentences are processed. Previously the
	 * cutoff was tested only after sentence <code>CUTOFF</code> had been
	 * handled, so one sentence too many was mapped. A negative cutoff still
	 * means "no limit", as it did before.
	 * 
	 * @param nInteractions
	 *            highest interaction order to map (orders start at 1)
	 * @param elicitationCorpus
	 *            the corpus supplying the sentences
	 * @param minMan
	 *            performs the mapping to minimal pairs
	 * @param CUTOFF
	 *            maximum number of sentences to process; negative for all
	 * @param est
	 *            not used by this method
	 * @param mappings
	 *            output list that receives one entry per processed sentence
	 * @param nMinPairs
	 *            running total of minimal pairs to add to
	 * @param screenWidth
	 *            width of the text progress bar
	 * @return <code>nMinPairs</code> plus the number of mappings found here
	 */
	private static int buildMinPairs(int nInteractions, Corpus elicitationCorpus,
			MinimalPairManager minMan, final int CUTOFF, RemainingTimeEstimator est,
			ArrayList<Pair<SentencePair, ArrayList<MinimalPairMapping>>> mappings, int nMinPairs,
			int screenWidth) throws CorpusException {

		MemoryMonitor minPairMonitor = new MemoryMonitor();
		SecondTimer minPairTimer = new SecondTimer(true, true);

		// number of sentences that will actually be processed
		final int nSentences = elicitationCorpus.getSentences().size();
		final int limit = (CUTOFF < 0) ? nSentences : Math.min(CUTOFF, nSentences);

		// size the progress bar to the real amount of work so that it reaches
		// 100% even when a cutoff is in effect
		TaskListener bar = new TextProgressBar(System.err, "sent", 10000, screenWidth);
		bar.beginTask(limit);

		for (int i = 0; i < limit; i++) {
			final SentencePair sentence = elicitationCorpus.getSentences().get(i);

			// collect the mappings for every interaction order of this sentence
			ArrayList<MinimalPairMapping> minPairs = new ArrayList<MinimalPairMapping>();
			for (int j = 1; j <= nInteractions; j++) {
				minPairs.addAll(minMan.mapToMinimalPairs(sentence.getFeatureStructure(), j, true));
			}
			nMinPairs += minPairs.size();
			mappings.add(new Pair<SentencePair, ArrayList<MinimalPairMapping>>(sentence, minPairs));

			bar.recordEventCompletion();
		}
		bar.endTask();
		minPairTimer.pause();
		System.out.println("Built minimal pairs in " + minPairTimer.getSecondsFormatted()
				+ " seconds using " + minPairMonitor.getUsageDelta());
		return nMinPairs;
	}

	/**
	 * Creates and loads the ParaMor segmenter when
	 * <code>inference.doSegmentation</code> is true.
	 * 
	 * @param props
	 *            the raw configuration; also handed to the ParaMor constructor
	 * @return the loaded segmenter, or <code>null</code> when segmentation is
	 *         disabled
	 */
	public static Segmenter loadSegmenter(Properties props) throws IOException, SegmenterException {

		final SmartProperties config = new SmartProperties(props);

		// segmentation is optional; callers receive null when it is switched off
		if (!config.getPropertyBoolean("inference.doSegmentation")) {
			return null;
		}

		System.out.println("Loading segmenter...");
		final Segmenter paraMor = new ParaMor(props);
		final File modelFile = config.getPropertyFile("morphAnalyzer.segmenterSaveFile");
		paraMor.loadSegmentationModel(modelFile);

		// NOTE(review): the message mentions the induction corpus, but the
		// evidence corpus set here is an empty list; confirm this is intended.
		System.out.println("Using induction corpus as evidence corpus.");
		paraMor.setEvidenceCorpus(new ArrayList<String>(0));

		return paraMor;
	}

	/**
	 * Constructs the feature structure manager from the XML feature
	 * specification and reports the time taken on standard error.
	 * 
	 * @param xmlFeatureSpec
	 *            the feature specification file
	 * @return the newly constructed manager
	 */
	private static FeatureStructureManager loadFeatureSpec(File xmlFeatureSpec) throws IOException,
			SAXException, ParserConfigurationException, ConfigurationException {

		final SecondTimer stopwatch = new SecondTimer(true, true);
		final FeatureStructureManager manager = new FeatureStructureManager(xmlFeatureSpec);
		// stop the clock before logging so that only construction is measured
		stopwatch.pause();

		System.err.println("Loaded feature specification in " + stopwatch.getSecondsFormatted()
				+ " seconds.");

		return manager;
	}

	/**
	 * Loads the elicited sentence pairs and then overwrites every normalized
	 * target token with the result of
	 * <code>UtfUtils.replaceUnicodeCharsWith(token, 'x')</code>.
	 * 
	 * @param encoding
	 *            the encoding passed to the serializer
	 * @param elicitedCorpusFile
	 *            the corpus file to load
	 * @return the loaded corpus
	 */
	private static Corpus loadCorpus(String encoding, File elicitedCorpusFile) throws IOException,
			ParseException {

		// TODO: Move this option to the config file
		final Corpus corpus = Serializer.loadSentencePairs(elicitedCorpusFile, true, encoding);

		// kill unicode chars
		// The tokens are rewritten in the array returned by the sentence pair;
		// presumably that is the pair's own backing array, so the change is
		// visible to later readers -- verify against SentencePair.
		for (final SentencePair pair : corpus.getSentences()) {
			final String[] tokens = pair.getNormalizedTargetTokens();
			int pos = 0;
			while (pos < tokens.length) {
				tokens[pos] = UtfUtils.replaceUnicodeCharsWith(tokens[pos], 'x');
				pos++;
			}
		}

		return corpus;
	}

	/**
	 * Performs a word frequency analysis on the target language side of the
	 * corpus by calling <code>HashUtils.increment</code> once per normalized
	 * target token occurrence.
	 * 
	 * @param elicitationCorpus
	 *            the corpus whose target tokens are counted
	 * @return a new map keyed by token holding the accumulated counts
	 */
	public static HashMap<String, Integer> analyzeFrequencies(Corpus elicitationCorpus) {
		final HashMap<String, Integer> frequencies = new HashMap<String, Integer>();

		for (final SentencePair pair : elicitationCorpus.getSentences()) {
			final String[] tokens = pair.getNormalizedTargetTokens();
			for (int t = 0; t < tokens.length; t++) {
				HashUtils.increment(frequencies, tokens[t]);
			}
		}

		return frequencies;
	}

	/**
	 * Builds the stopword set as the union of the lower-cased tokens of four
	 * configured files: function words, stopwords, male names and female
	 * names.
	 * 
	 * @param smartProps
	 *            configuration supplying the four file locations
	 * @return a new set containing every lower-cased token from the files
	 * @throws IOException
	 *             if one of the files cannot be read
	 */
	public static HashSet<String> initStopwords(SmartProperties smartProps) throws IOException {

		// look up all four locations before touching any file
		final File functionWords = smartProps.getPropertyFile("paths.functionWordsFile");
		final File femaleNames = smartProps.getPropertyFile("paths.femaleNamesFile");
		final File maleNames = smartProps.getPropertyFile("paths.maleNamesFile");
		final File extraStopwords = smartProps.getPropertyFile("inference.stopwordsFile");

		// read order: function words, stopwords, male names, female names
		final File[] sources = { functionWords, extraStopwords, maleNames, femaleNames };

		final HashSet<String> stopwords = new HashSet<String>();
		for (final File source : sources) {
			final String lowercased = FileUtils.getFileAsString(source).toLowerCase();
			stopwords.addAll(Arrays.asList(StringUtils.tokenize(lowercased)));
		}
		return stopwords;
	}
}
