/**
 * The AVENUE Project
 * Language Technologies Institute
 * School of Computer Science
 * (c) 2007 Carnegie Mellon University
 * 
 * Corpus Navigator
 * Written by Jonathan Clark
 */
package edu.cmu.cs.lti.avenue.corpus;

import info.jonclark.lang.Pair;
import info.jonclark.log.LogUtils;
import info.jonclark.util.StringUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.logging.Logger;

import edu.cmu.cs.lti.avenue.navigation.featuredetection.deductive.FeatureManager;
import edu.cmu.cs.lti.avenue.navigation.featuredetection.deductive.Rule;
import edu.cmu.cs.lti.avenue.navigation.search.generation0.featurebitmaps.Feature;
import edu.cmu.cs.lti.avenue.navigation.search.generation0.featurebitmaps.FeatureFactory;
import edu.cmu.cs.lti.avenue.navigation.search.generation0.featurebitmaps.FeatureGroup;
import edu.cmu.cs.lti.avenue.trees.smart.SmartTree;
import edu.cmu.cs.lti.avenue.trees.smart.SmartTree.LabelMode;

/**
 * Static helpers for reading and writing the text formats used by the corpus
 * navigator: elicitation corpora (sentence pairs), implicational universals,
 * sentence tests, rules, and generic parenthesized trees.
 * <p>
 * Every loader closes the file it opens, even when parsing fails.
 */
public class Serializer {

	private static final Logger log = LogUtils.getLogger();

	/**
	 * Writes a corpus to a file using the corpus' own encoding. The output
	 * starts with three header lines (<code>encoding:</code>,
	 * <code>srclang:</code>, <code>tgtlang:</code>), followed by the
	 * serialized form of each sentence pair and a blank line after each pair.
	 * 
	 * @param corpus
	 *            the corpus to write; its <code>encoding</code> field names
	 *            the charset of the output file
	 * @param outFile
	 *            the file to create or overwrite
	 * @throws FileNotFoundException
	 *             if the file cannot be opened for writing
	 * @throws UnsupportedEncodingException
	 *             if the corpus encoding is not a supported charset
	 */
	public static void saveSentencePairs(Corpus corpus, File outFile) throws FileNotFoundException,
			UnsupportedEncodingException {

		PrintWriter out = new PrintWriter(outFile, corpus.encoding);
		try {
			out.println("encoding: " + corpus.encoding);
			out.println("srclang: " + corpus.srcLang);
			out.println("tgtlang: " + corpus.tgtLang);

			for (final SentencePair pair : corpus.getSentences()) {
				out.println(pair.serialize());
				out.println();
			}

			// PrintWriter never throws on write failures; surface them in the
			// log since the signature does not allow an IOException here
			if (out.checkError()) {
				log.warning("An I/O error occurred while writing sentence pairs to " + outFile);
			}
		} finally {
			out.close();
		}
	}

	/**
	 * Loads an elicitation corpus without assigning sequential ids; equivalent
	 * to {@link #loadSentencePairs(File, boolean, String)} with
	 * <code>assignIds == false</code>.
	 * 
	 * @param elicitationCorpusFile
	 *            the corpus file to read
	 * @param charset
	 *            the name of the charset used to decode the file
	 * @return the loaded corpus
	 * @throws IOException
	 *             if the file cannot be opened or the charset is unsupported
	 * @throws ParseException
	 *             if an entry cannot be parsed
	 */
	public static Corpus loadSentencePairs(File elicitationCorpusFile, String charset)
			throws IOException, ParseException {
		return loadSentencePairs(elicitationCorpusFile, false, charset);
	}

	/**
	 * Loads an elicitation corpus. Lines before the first <code>newpair</code>
	 * line are treated as the header (<code>encoding</code>,
	 * <code>srclang:</code>, <code>tgtlang:</code>); each <code>newpair</code>
	 * line then starts one entry made of <code>tag: value</code> lines. An
	 * entry carrying an <code>alternate</code> tag is attached to the most
	 * recently added sentence pair instead of being added to the corpus
	 * itself.
	 * <p>
	 * Missing header values default to <code>UTF8</code>,
	 * <code>eliciting</code> and <code>elicited</code> respectively.
	 * 
	 * @param elicitationCorpusFile
	 *            the corpus file to read
	 * @param assignIds
	 *            if true, each entry initially gets its 1-based sequence number
	 *            as id; a <code>sentid#</code> tag in the entry still overrides
	 *            it
	 * @param charset
	 *            the name of the charset used to decode the file (the encoding
	 *            named in the file header is recorded but not used for decoding)
	 * @return the loaded corpus
	 * @throws IOException
	 *             if the file cannot be opened or the charset is unsupported
	 * @throws ParseException
	 *             if any entry cannot be read or parsed (this includes I/O
	 *             errors that occur while reading an entry)
	 */
	public static Corpus loadSentencePairs(File elicitationCorpusFile, boolean assignIds,
			String charset) throws IOException, ParseException {

		Corpus corpus = new Corpus();

		// TODO: Respect the encoding specified at the top of the file
		// TODO: Handle other possible tags besides newpair -- read header info

		// open the raw stream first so that it is closed even if the charset
		// name is rejected by the InputStreamReader constructor
		FileInputStream fileIn = new FileInputStream(elicitationCorpusFile);
		try {
			BufferedReader in = new BufferedReader(new InputStreamReader(fileIn, charset));
			int nLine = 0;

			// advance to the first pair and gather header info
			String corpusLine;
			while ((corpusLine = in.readLine()) != null && !(corpusLine.trim().equals("newpair"))) {
				nLine++;
				corpusLine = corpusLine.trim();

				if (corpusLine.startsWith("encoding")) {
					// WARNING: This is currently ignored by the loader
					corpus.encoding = StringUtils.substringAfter(corpusLine, "encoding:").trim();
				} else if (corpusLine.startsWith("srclang:")) {
					corpus.srcLang = StringUtils.substringAfter(corpusLine, "srclang:").trim();
				} else if (corpusLine.startsWith("tgtlang:")) {
					corpus.tgtLang = StringUtils.substringAfter(corpusLine, "tgtlang:").trim();
				}
			}

			if (corpus.encoding == null) {
				corpus.encoding = "UTF8";
			}

			if (corpus.srcLang == null) {
				corpus.srcLang = "eliciting";
			}

			// the target language default must be guarded by tgtLang itself
			if (corpus.tgtLang == null) {
				corpus.tgtLang = "elicited";
			}

			int seqNum = 0;

			// read file to exhaustion; on entry to each iteration corpusLine
			// holds the "newpair" line that starts the entry
			while (corpusLine != null) {

				try {

					seqNum++;
					nLine++;

					int id = -1;
					String[] eSentence = null;
					String[] fSentence = new String[0];
					boolean[] sourceMorphemeBoundaries = null;
					boolean[] targetMorphemeBoundaries = null;
					String alignments = "";
					String context = "";
					String comment = "";
					SmartTree featureStructure = null;
					SmartTree sourceConstituentStructure = null;
					SmartTree targetConstituentStructure = null;
					PhiPlusMapping phiPlusMapping = null;
					String fstLine = null;
					String cstLine = null;
					// location of the "newpair" line of this entry
					String myLine = elicitationCorpusFile + ":" + nLine;
					boolean alternate = false;

					if (assignIds) {
						id = seqNum;
					}

					while ((corpusLine = in.readLine()) != null
							&& !(corpusLine.trim().equals("newpair"))) {

						nLine++;
						corpusLine = corpusLine.trim();

						if (corpusLine.equals("")) {
							continue;
						} else if (corpusLine.startsWith("(")) {
							// an untagged line starting with a parenthesis is
							// read as a feature structure
							featureStructure =
									SmartTree.parse(corpusLine.trim(), "f", LabelMode.LABEL_ODD_NODES);
						} else {

							String value = StringUtils.substringAfter(corpusLine, ":");
							value = value.trim();

							if (corpusLine.startsWith("sentid#")) {
								id = Integer.parseInt(value);
							} else if (corpusLine.startsWith("srcsent")) {
								Pair<String[], boolean[]> pair = SentencePair.tokenizeSentence(value);
								eSentence = pair.first;
								sourceMorphemeBoundaries = pair.second;
							} else if (corpusLine.startsWith("tgtsent")) {
								Pair<String[], boolean[]> pair = SentencePair.tokenizeSentence(value);
								fSentence = pair.first;
								targetMorphemeBoundaries = pair.second;
							} else if (corpusLine.startsWith("aligned")) {
								alignments = value;
							} else if (corpusLine.startsWith("context")) {
								context = value;
							} else if (corpusLine.startsWith("comment")) {
								comment = value;
							} else if (corpusLine.startsWith("fstruct")) {
								featureStructure =
										SmartTree.parse(value.trim(), SmartTree.F_STRUCT_LABEL,
												LabelMode.LABEL_ODD_NODES);
							} else if (corpusLine.startsWith("cstruct")) {
								sourceConstituentStructure =
										SmartTree.parse(value.trim(), SmartTree.SOURCE_C_STRUCT_LABEL,
												LabelMode.LABEL_ALL_NODES);
							} else if (corpusLine.startsWith("cstrtgt")) {
								targetConstituentStructure =
										SmartTree.parse(value.trim(), SmartTree.TARGET_C_STRUCT_LABEL,
												LabelMode.LABEL_ALL_NODES);
							} else if (corpusLine.startsWith("phiplus")) {
								phiPlusMapping = PhiPlusMapping.deserialize(value);
							} else if (corpusLine.startsWith("fstline")) {
								fstLine = value;
							} else if (corpusLine.startsWith("cstline")) {
								cstLine = value;
							} else if (corpusLine.startsWith("alternate")) {
								alternate = true;
							} else {
								log.warning("Unknown tag in elicitation corpus ("
										+ elicitationCorpusFile.getName() + ":" + nLine + "): "
										+ corpusLine);
							}
						}
					}

					// a missing source sentence means we have an incomplete
					// entry (or the file ended in the middle of an entry)
					if (eSentence == null)
						throw new ParseException("Error while parsing file (null eSentence): "
								+ elicitationCorpusFile + ":" + nLine, nLine);

					SentencePair sentencePair =
							SentencePairFactory.getInstance(id, eSentence, fSentence,
									sourceMorphemeBoundaries, targetMorphemeBoundaries, alignments,
									context, comment, featureStructure, sourceConstituentStructure,
									targetConstituentStructure, phiPlusMapping, fstLine, cstLine,
									myLine);

					if (alternate) {
						// an alternate needs a previously loaded pair to attach to
						if (corpus.getSentences().size() == 0)
							throw new ParseException(
									"Alternate entry has no preceding sentence pair: " + myLine,
									nLine);

						SentencePair prev =
								corpus.getSentences().get(corpus.getSentences().size() - 1);
						prev.addAlternate(sentencePair);
					} else {
						corpus.addSentence(sentencePair);
					}

				} catch (Exception e) {
					// report every failure as a ParseException carrying the file
					// position, and keep the original exception as the cause
					ParseException wrapped =
							new ParseException("Error while parsing file " + elicitationCorpusFile
									+ ":" + nLine + "\n" + StringUtils.getStackTrace(e), nLine);
					wrapped.initCause(e);
					throw wrapped;
				}

			} // end while more entries
		} finally {
			// closing the underlying stream releases the file handle
			fileIn.close();
		}

		return corpus;
	}

	/**
	 * Loads implicational universals from a tab-separated file. Column 0 of
	 * each line is parsed as a probability; for lines whose probability is
	 * strictly greater than <code>minProb</code>, a trigger feature is built
	 * from columns 1 and 3 and an implication feature from columns 2 and 4
	 * (each joined as <code>"name: value"</code>). Blank lines are skipped.
	 * 
	 * @param universalsFile
	 *            the file to read (decoded with the platform default charset)
	 * @param minProb
	 *            exclusive lower bound on the probability of loaded entries
	 * @return one feature group per accepted line, in file order
	 * @throws NumberFormatException
	 *             if the first column of a line is not a valid double
	 * @throws IOException
	 *             if the file cannot be read
	 */
	public static ArrayList<FeatureGroup<Feature>> loadImplicationalUniversals(File universalsFile,
			double minProb) throws NumberFormatException, IOException {

		ArrayList<FeatureGroup<Feature>> implicationalUniversals =
				new ArrayList<FeatureGroup<Feature>>();

		BufferedReader in = new BufferedReader(new FileReader(universalsFile));
		try {
			String line;
			while ((line = in.readLine()) != null) {

				// tolerate blank lines (e.g. a trailing newline at end of file)
				if (line.trim().length() == 0)
					continue;

				String[] tokens = StringUtils.split(line, "\t", 11);

				double prob = Double.parseDouble(tokens[0]);
				if (prob > minProb) {
					String strTrigger = tokens[1] + ": " + tokens[3];
					String strImplication = tokens[2] + ": " + tokens[4];

					Feature trigger = FeatureFactory.getInstance(strTrigger);
					Feature implication = FeatureFactory.getInstance(strImplication);

					implicationalUniversals.add(new FeatureGroup<Feature>(trigger, implication));
				}
			}
		} finally {
			in.close();
		}

		return implicationalUniversals;
	}

	/**
	 * Loads sentence tests into an array list, enclosing each in a feature
	 * group that holds all features named by the test's <code>ifequal</code>
	 * and <code>ifnoteq</code> tags.
	 * <p>
	 * Everything before the first <code>newtest</code> line is skipped. Each
	 * test consists of <code>ifequal</code>, <code>ifnoteq</code> and
	 * <code>require</code> lines and is terminated by an empty line or the end
	 * of the file. Runs of blank lines (including a trailing one) are
	 * ignored.
	 * 
	 * @param testsFile
	 *            the file to read (decoded with the platform default charset)
	 * @param sentencePairs
	 *            currently unused by this method
	 * @return the loaded tests, in file order
	 * @throws IOException
	 *             if the file cannot be read, or if a test is missing one of
	 *             its <code>ifequal</code>, <code>ifnoteq</code> or
	 *             <code>require</code> tags
	 */
	public static ArrayList<FeatureGroup<SentenceTest>> loadSentenceTests(File testsFile,
			HashSet<SentencePair> sentencePairs) throws IOException {
		ArrayList<FeatureGroup<SentenceTest>> sentenceTests =
				new ArrayList<FeatureGroup<SentenceTest>>();

		BufferedReader in = new BufferedReader(new FileReader(testsFile));
		try {
			int nLine = 0;

			// TODO: Handle other possible tags besides newtest
			// advance to the first "newtest" line (or the end of the file)
			String testLine;
			while ((testLine = in.readLine()) != null) {
				nLine++;
				if (testLine.equals("newtest"))
					break;
			}

			while (testLine != null) {

				// send a group of sentences (1 for now)
				ArrayList<Feature> ifEqual = null;
				ArrayList<Feature> ifNotEqual = null;
				int[] required = null;

				// read one entry: up to the next empty line or end of file
				while ((testLine = in.readLine()) != null) {
					nLine++;
					if (testLine.equals(""))
						break;

					String value = StringUtils.substringAfter(testLine, ":");
					value = value.trim();

					if (testLine.startsWith("newtest")) {
						continue;
					} else if (testLine.startsWith("ifequal")) {

						String[] features = StringUtils.tokenize(value, ",");
						ifEqual = new ArrayList<Feature>(features.length);
						for (String feature : features) {
							ifEqual.add(FeatureFactory.getInstance(feature.trim()));
						}

					} else if (testLine.startsWith("ifnoteq")) {

						String[] features = StringUtils.tokenize(value, ",");
						ifNotEqual = new ArrayList<Feature>(features.length);
						for (String feature : features) {
							ifNotEqual.add(FeatureFactory.getInstance(feature.trim()));
						}

					} else if (testLine.startsWith("require")) {
						required = StringUtils.toIntArray(StringUtils.tokenize(value));
					} else {
						log.warning("Unknown tag in elicitation corpus: " + testLine);
					}
				}

				// nothing was read for this entry: this happens for consecutive
				// blank lines and for a blank line at the end of the file
				if (ifEqual == null && ifNotEqual == null && required == null)
					continue;

				// any of these being null means we have an incomplete entry (or
				// the file ended in the middle of an entry)
				if (ifEqual == null || ifNotEqual == null || required == null)
					throw new IOException("Incomplete sentence test ending at " + testsFile + ":"
							+ nLine + " (ifequal, ifnoteq and require are all needed)");

				SentenceTest test = new SentenceTest(required, ifEqual, ifNotEqual);

				ArrayList<Feature> allFeatures =
						new ArrayList<Feature>(ifEqual.size() + ifNotEqual.size());
				allFeatures.addAll(ifEqual);
				allFeatures.addAll(ifNotEqual);

				FeatureGroup<SentenceTest> node = new FeatureGroup<SentenceTest>(allFeatures, test);
				sentenceTests.add(node);
			} // end while testLine != null
		} finally {
			in.close();
		}

		return sentenceTests;
	}

	/**
	 * Loads rules from a file. Blank lines and lines starting with
	 * <code>#</code> are ignored. The remaining lines are accumulated until
	 * the numbers of <code>(</code> and <code>)</code> seen so far balance, at
	 * which point the accumulated text is handed to
	 * <code>Rule.createRule</code>.
	 * 
	 * @param rulesFile
	 *            the file to read (decoded with the platform default charset)
	 * @param featureMan
	 *            passed through to <code>Rule.createRule</code>
	 * @return the loaded rules, in file order
	 * @throws IOException
	 *             if the file cannot be read
	 * @throws ParseException
	 *             if a line starting with <code>(rule</code> is found while
	 *             the previous rule is still open, if the file ends with
	 *             unbalanced parentheses, or if a rule cannot be created
	 */
	public static ArrayList<Rule> loadRules(File rulesFile, FeatureManager featureMan)
			throws IOException, ParseException {

		ArrayList<Rule> rules = new ArrayList<Rule>();
		BufferedReader in = new BufferedReader(new FileReader(rulesFile));

		String ruleBeginLine = null;
		int nOpen = 0;
		int nLine = 0;
		boolean emptyBuffer = true;
		StringBuilder serializedRule = new StringBuilder();

		try {
			String line;
			while ((line = in.readLine()) != null) {
				nLine++;

				if (line.trim().length() == 0)
					continue;

				if (!line.startsWith("#")) {
					serializedRule.append(line + "\n");

					// count ( and ) in each line, create a rule each time we
					// hit zero
					int nNewOpen = StringUtils.countOccurances(line, '(');
					int nNewClose = StringUtils.countOccurances(line, ')');

					if (line.startsWith("(rule")) {
						if (ruleBeginLine == null) {
							ruleBeginLine = rulesFile.getAbsolutePath() + ":" + nLine;
						} else {
							throw new ParseException(
									"Unfinished rule starting at " + ruleBeginLine, nLine);
						}
					}

					if (nNewOpen > 0)
						emptyBuffer = false;

					nOpen = nOpen + nNewOpen - nNewClose;

					if (!emptyBuffer && nOpen == 0) {
						Rule rule = Rule.createRule(serializedRule.toString(), featureMan);
						rules.add(rule);

						serializedRule = new StringBuilder();
						ruleBeginLine = null;
						emptyBuffer = true;
					}
				}
			}

			// parentheses still open at end of file: the last rule never
			// completed, so report it rather than dropping it silently
			if (nOpen > 0) {
				if (ruleBeginLine == null) {
					ruleBeginLine = rulesFile.getAbsolutePath() + ":" + nLine;
					throw new ParseException("Unbalanced parentheses at end of file "
							+ ruleBeginLine, nLine);
				} else {
					throw new ParseException("Unfinished rule starting at " + ruleBeginLine, nLine);
				}
			}
		} catch (ParseException e) {
			// add the location of the rule being read and keep the cause
			ParseException wrapped =
					new ParseException("ParseException while reading rule at " + ruleBeginLine
							+ "\n" + StringUtils.getStackTrace(e), -1);
			wrapped.initCause(e);
			throw wrapped;
		} finally {
			in.close();
		}

		return rules;
	}

	/**
	 * Splits a file into the strings of the parenthesized trees it contains.
	 * Blank lines and comment lines are ignored. The remaining lines are
	 * accumulated until the numbers of <code>(</code> and <code>)</code> seen
	 * so far balance, at which point the accumulated text (each line followed
	 * by a newline) is added to the result.
	 * 
	 * @param treesFile
	 *            the file to read (decoded with the platform default charset)
	 * @param treeStart
	 *            prefix that marks the first line of a tree; used only to
	 *            detect a new tree starting before the previous one is closed
	 * @param commentLinesStartWith
	 *            prefixes that mark a line as a comment; may be null or empty
	 *            if the file has no comment lines
	 * @return the tree strings, in file order
	 * @throws IOException
	 *             if the file cannot be read
	 * @throws ParseException
	 *             if a line starting with <code>treeStart</code> is found
	 *             while the previous tree is still open
	 */
	public static ArrayList<String> getTreeStringsFromFile(File treesFile, String treeStart,
			String[] commentLinesStartWith) throws IOException, ParseException {

		ArrayList<String> trees = new ArrayList<String>();
		BufferedReader in = new BufferedReader(new FileReader(treesFile));

		String ruleBeginLine = null;
		int nOpen = 0;
		int nLine = 0;
		boolean emptyBuffer = true;
		StringBuilder serializedRule = new StringBuilder();

		try {
			String line;
			while ((line = in.readLine()) != null) {
				nLine++;

				if (line.trim().length() == 0)
					continue;

				boolean isComment = false;
				if (commentLinesStartWith != null) {
					for (final String commentStart : commentLinesStartWith) {
						if (line.startsWith(commentStart)) {
							isComment = true;
							// one matching prefix is enough
							break;
						}
					}
				}

				if (!isComment) {
					serializedRule.append(line + "\n");

					// count ( and ) in each line, emit a tree each time we hit
					// zero
					int nNewOpen = StringUtils.countOccurances(line, '(');
					int nNewClose = StringUtils.countOccurances(line, ')');

					if (line.startsWith(treeStart)) {
						if (ruleBeginLine == null) {
							ruleBeginLine = treesFile.getAbsolutePath() + ":" + nLine;
						} else {
							throw new ParseException(
									"Unfinished tree starting at " + ruleBeginLine, nLine);
						}
					}

					if (nNewOpen > 0)
						emptyBuffer = false;

					nOpen = nOpen + nNewOpen - nNewClose;

					if (!emptyBuffer && nOpen == 0) {
						trees.add(serializedRule.toString());

						serializedRule = new StringBuilder();
						ruleBeginLine = null;
						emptyBuffer = true;
					}
				}
			}
		} catch (ParseException e) {
			// add the location of the tree being read and keep the cause
			ParseException wrapped =
					new ParseException("ParseException while reading rule at " + ruleBeginLine
							+ "\n" + StringUtils.getStackTrace(e), -1);
			wrapped.initCause(e);
			throw wrapped;
		} finally {
			in.close();
		}

		return trees;
	}
}
