package edu.cmu.cs.lti.avenue.morphanalyzer;

import info.jonclark.properties.PropertyUtils;
import info.jonclark.properties.SmartProperties;
import info.jonclark.util.StringUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
import java.util.Map.Entry;

import edu.cmu.cs.lti.avenue.morphology.ParaMor;
import edu.cmu.cs.lti.avenue.morphology.Paradigm;
import edu.cmu.cs.lti.avenue.morphology.Segmenter;
import edu.cmu.cs.lti.avenue.morphology.SegmenterException;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.ExpressedThemeManager;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.inductive.evidence.ObservedMorpheme;

public class MorphologicalAnalyzer {

	private final String[] featureNames;
	private final HashMap<ObservedMorpheme, String>[] featureTable;
	private final Segmenter segmenter;

	private int nConflicts = 0;

	private static final String CONFLICT = "CONFLICT";
	private static final String NA = "N/A";

	public MorphologicalAnalyzer(ExpressedThemeManager themes, Segmenter segmenter) {
		this.segmenter = segmenter;

		this.featureNames = themes.getExpressedFeatureNames();
		this.featureTable = new HashMap[featureNames.length];
		for (int i = 0; i < featureNames.length; i++) {
			this.featureTable[i] = themes.getMorphemeFeatureValuePairingsFor(featureNames[i]);
		}
	}

	private MorphologicalAnalyzer(String[] featureNames,
			HashMap<ObservedMorpheme, String>[] featureTable, Segmenter segmenter) {
		this.featureNames = featureNames;
		this.featureTable = featureTable;
		this.segmenter = segmenter;
	}

	public String[] analyze(String token) throws SegmenterException {

		final int nNonFeatureFactors = 3;
		final int nFactors = nNonFeatureFactors + featureNames.length;

		String[] analysis = new String[nFactors];

		// factor 0 = surface form
		analysis[0] = token;

		// factor 1 = lemma
		String[] segmentedWord = segmenter.getCombinedSegmentation(token);
		String lemma = segmentedWord[0];
		analysis[1] = lemma;

		// factor 2 = paradigm ID (acting like POS)
		List<Paradigm> paradigms = segmenter.getParadigms(token);
		// choose the largest paradigm (the one with the most stems) for now
		if (paradigms.size() == 0) {
			analysis[2] = "NONE";
		} else {
			Paradigm largestParadigm = paradigms.get(0);
			for (Paradigm p : paradigms) {
				if (p.getStems().length > largestParadigm.getStems().length) {
					largestParadigm = p;
				}
			}
			analysis[2] = largestParadigm.getId() + "";
		}

		// TODO: How do I reasonably decide which paradigm ID to use? The
		// largest?

		// factor 3...n = selected MILE features

		// For each morpheme, see which features we can
		// associate with it, given the current paradigm(s) and then
		// fill these in. If we have conflicting features, choose
		// N/A.
		String[] featureValues = new String[featureNames.length];
		for (String morpheme : segmentedWord) {
			
			// used dummy "marked on" values since we're only using this as a lookup
			ObservedMorpheme addedMorph = new ObservedMorpheme(paradigms, morpheme, segmentedWord, false, false, false);

			for (int j = 0; j < featureNames.length; j++) {

				String featureName = featureNames[j];
				String featureValue = this.featureTable[j].get(addedMorph);

				if (featureValue != null) {
					if (featureValues[j] == null || featureValues[j].equals(featureValue)) {
						featureValues[j] = featureValue;
					} else {
						// we don't have a definite answer for this morpheme
						featureValues[j] = CONFLICT;
						nConflicts++;
					}
				}
			}
		}

		for (int j = 0; j < featureValues.length; j++) {
			if (featureValues[j] == null || featureValues[j].equals(CONFLICT)) {
				analysis[j + nNonFeatureFactors] = NA;
			} else {
				analysis[j + nNonFeatureFactors] = featureValues[j];
			}
		}

		return analysis;
	}

	public void saveTaggerData(File tagFile) throws FileNotFoundException, IOException,
			SegmenterException {

		PrintWriter out = new PrintWriter(tagFile);

		// serialize entries
		for (int i = 0; i < featureNames.length; i++) {
			String featureName = featureNames[i];
			out.println("<FEATURE>");
			out.println("<NAME>" + featureName + "</NAME>");
			for (Entry<ObservedMorpheme, String> entry : featureTable[i].entrySet()) {
				ObservedMorpheme morph = entry.getKey();
				out.print("<ENTRY>");
				out.print("<VALUE>" + entry.getValue() + "</VALUE> ");
				out.print("<MORPHEME>" + morph.morpheme + "</MORPHEME> ");
				out.print("<ME>" + morph.markedOnMe + "</ME>");
				out.print("<DEPENDENT>" + morph.markedOnMyDependent + "</DEPENDENT>");
				out.print("<GOVERNOR>" + morph.markedOnMyGovernor + "</GOVERNOR>");
				out.print("<PARADIGMS>");
				for (Paradigm p : morph.paradigms) {
					out.print(p.getId() + ",");
				}
				out.print("</PARADIGMS> ");
				out.print("<WORD>" + StringUtils.untokenize(morph.segmentedWord, " ") + "</WORD> ");
				out.println("</ENTRY>");
			}
			out.println("</FEATURE>");
		}

		out.close();
	}

	public static MorphologicalAnalyzer load(File tagFile, File segmenterFile,
			Properties segmenterProps) throws FileNotFoundException, IOException,
			ClassNotFoundException, SegmenterException {

		Segmenter segmenter = new ParaMor(segmenterProps);
		segmenter.loadSegmentationModel(segmenterFile);

		List<Paradigm> allParadigms = segmenter.getAllParadigms();

		ArrayList<String> featureNames = new ArrayList<String>();
		ArrayList<HashMap<ObservedMorpheme, String>> featureTable =
				new ArrayList<HashMap<ObservedMorpheme, String>>();
		HashMap<ObservedMorpheme, String> currentFeatureMap = null;

		BufferedReader in = new BufferedReader(new FileReader(tagFile));

		String line;
		while ((line = in.readLine()) != null) {

			if (line.startsWith("<NAME>")) {

				String featureName = StringUtils.substringBetween(line, "<NAME>", "</NAME>");
				featureNames.add(featureName);
				currentFeatureMap = new HashMap<ObservedMorpheme, String>();
				featureTable.add(currentFeatureMap);

			} else if (line.startsWith("<ENTRY>")) {

				String value = StringUtils.substringBetween(line, "<VALUE>", "</VALUE>");
				String morpheme = StringUtils.substringBetween(line, "<MORPHEME>", "</MORPHEME>");
				boolean markedOnMe =
						Boolean.parseBoolean(StringUtils.substringBetween(line, "<ME>", "</ME>"));
				boolean markedOnMyDependent =
						Boolean.parseBoolean(StringUtils.substringBetween(line, "<DEPENDENT>",
								"</DEPENDENT>"));
				boolean markedOnMyGovernor =
						Boolean.parseBoolean(StringUtils.substringBetween(line, "<GOVERNOR>",
								"</GOVERNOR>"));

				String strParadigms =
						StringUtils.substringBetween(line, "<PARADIGMS>", "</PARADIGMS>");
				String[] arrParadigms = StringUtils.tokenize(strParadigms, ",");
				int[] paradigms = StringUtils.toIntArray(arrParadigms);
				ArrayList<Paradigm> listParadigms = new ArrayList<Paradigm>(paradigms.length);
				for (int n : paradigms) {
					Paradigm p = allParadigms.get(n);
					assert p.getId() == n : "Paradigm ID mismatch";
					listParadigms.add(p);
				}

				String word = StringUtils.substringBetween(line, "<WORD>", "</WORD>");
				String[] segmentedWord = StringUtils.tokenize(word);

				ObservedMorpheme morph =
						new ObservedMorpheme(listParadigms, morpheme, segmentedWord, markedOnMe,
								markedOnMyDependent, markedOnMyGovernor);
				currentFeatureMap.put(morph, value);
			}

		}

		in.close();

		return new MorphologicalAnalyzer(featureNames.toArray(new String[featureNames.size()]),
				featureTable.toArray(new HashMap[featureTable.size()]), segmenter);
	}

	private static List<String> loadEvidenceData(File file) throws IOException {

		ArrayList<String> tokens = new ArrayList<String>();
		BufferedReader in = new BufferedReader(new FileReader(file));
		String line;
		while ((line = in.readLine()) != null) {
			tokens.addAll(Arrays.asList(line.split(" ")));
		}
		in.close();

		return tokens;
	}

	public static void main(String[] args) throws Exception {

		if (args.length != 1) {
			System.err.println("Usage: program <properties_file>");
			System.exit(1);
		}

		Properties props = PropertyUtils.getProperties(args[0]);
		SmartProperties smartProps = new SmartProperties(props);
		File segmenterData = smartProps.getPropertyFile("morphAnalyzer.segmenterSaveFile");
		File tagData = smartProps.getPropertyFile("morphAnalyzer.tagSaveFile");
		File inCorpus = smartProps.getPropertyFile("morphAnalyzer.inputCorpus");
		File outCorpus = smartProps.getPropertyFile("morphAnalyzer.factoredOutputCorpus");

		MorphologicalAnalyzer morph = load(tagData, segmenterData, props);
		List<String> evidenceTokens = loadEvidenceData(inCorpus);
		morph.segmenter.setEvidenceCorpus(evidenceTokens);

		BufferedReader in = new BufferedReader(new FileReader(inCorpus));
		PrintWriter out = new PrintWriter(outCorpus);

		// TODO: Set evidence corpus using entire corpus to be analyzed
		// TODO: Allow interface to just use a file?
		// TODO: Make sure EC is the first thing in the induction corpus

		String line;
		while ((line = in.readLine()) != null) {

			String[] tokensIn = StringUtils.tokenize(line);
			String[] tokensOut = new String[tokensIn.length];

			for (int i = 0; i < tokensIn.length; i++) {
				String[] analysis = morph.analyze(tokensIn[i]);
				tokensOut[i] = StringUtils.untokenize(analysis, "|");
			}

			out.println(StringUtils.untokenize(tokensOut));
		}

		out.close();
		in.close();
	}
}
