/**
 * The AVENUE Project
 * Language Technologies Institute
 * School of Computer Science
 * (c) 2007 Carnegie Mellon University
 * 
 * Corpus Navigator
 * Written by Jonathan Clark
 */
package edu.cmu.cs.lti.avenue.corpus;

import info.jonclark.lang.Pair;
import info.jonclark.util.DebugUtils;
import info.jonclark.util.StringUtils;

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;

import edu.cmu.cs.lti.avenue.trees.smart.SmartTree;
import edu.cmu.cs.lti.avenue.trees.smart.TreeNode;

/**
 * Represents a single sentence pair that either will be or has been presented
 * to the user during elicitation.
 * <p>
 * Each side of the pair is held in two forms: the "display" tokens (as given
 * by the caller) and the "normalized" tokens, which {@link #normalize()}
 * derives by lowercasing and retokenizing the display tokens. An
 * {@link Alignment} is kept for each form.
 * <p>
 * Equality is object identity, while {@link #hashCode()} and
 * {@link #compareTo(SentencePair)} are based on the numeric ID.
 */
public class SentencePair implements Comparable<SentencePair> {

	// NOTE(review): id is mutable (see setId) yet backs hashCode(); changing it
	// while this object sits in a hash-based collection would corrupt lookups.
	private int id;

	// display tokens: "e" is the source side, "f" is the target side
	private String[] eSentence;
	private String[] fSentence;

	// lowercased, retokenized versions of the display tokens (see normalize())
	private String[] eSentenceNormalized;
	private String[] fSentenceNormalized;

	// entry i is true when display token i attaches to token i-1 (a "+" in
	// the input)
	private boolean[] sourceMorphemeBoundaries;
	private boolean[] targetMorphemeBoundaries;

	// alignment over the display tokens / over the normalized tokens
	private Alignment alignment;
	private Alignment normalizedAlignment;

	private String context;
	private String comment;
	private boolean alternate;

	private final Retokenizer tok;

	// file:line pairs where components of this sentence pair originated (for
	// debugging)
	private String fstLine;
	private String cstLine;
	private String tgtLine;
	private String myLine;

	private SmartTree featureStructure;
	private SmartTree sourceConstituentStructure;
	private SmartTree targetConstituentStructure;
	private PhiPlusMapping phiPlusMapping;

	// alternates attached to this (non-alternate) pair
	private final ArrayList<SentencePair> alternates = new ArrayList<SentencePair>(1);

	/**
	 * Creates a sentence pair, builds its display alignment and derives the
	 * normalized sentences and normalized alignment.
	 * 
	 * @param id
	 *            numeric ID of this pair; -1 is treated as "not assigned"
	 * @param eSentence
	 *            display tokens of the source sentence
	 * @param fSentence
	 *            display tokens of the target sentence
	 * @param sourceMorphemeBoundaries
	 *            one flag per source token (see
	 *            {@link #getSourceMorphemeBoundaries()})
	 * @param targetMorphemeBoundaries
	 *            one flag per target token
	 * @param alignments
	 *            string form of the display alignment, parsed by
	 *            {@link Alignment}
	 * @param context
	 *            context string
	 * @param comment
	 *            comment string
	 * @param featureStructure
	 *            feature structure; may be null
	 * @param sourceConstituentStructure
	 *            source c-structure; may be null, otherwise its terminal
	 *            count must equal the number of normalized source tokens
	 * @param targetConstituentStructure
	 *            target c-structure; may be null
	 * @param phiPlusMapping
	 *            phi-plus mapping; may be null
	 * @param fstLine
	 *            file:line where the feature structure originated
	 * @param cstLine
	 *            file:line where the constituent structure originated
	 * @param myLine
	 *            file:line where this pair originated
	 * @throws ParseException
	 *             propagated from building or normalizing the alignment
	 * @throws CorpusException
	 *             if the source c-structure's terminal count differs from the
	 *             number of normalized source tokens
	 * @throws IOException
	 *             declared by the signature; presumably raised while creating
	 *             the {@link Retokenizer} (TODO confirm)
	 */
	protected SentencePair(int id, String[] eSentence, String[] fSentence,
			boolean[] sourceMorphemeBoundaries, boolean[] targetMorphemeBoundaries,
			String alignments, String context, String comment, SmartTree featureStructure,
			SmartTree sourceConstituentStructure, SmartTree targetConstituentStructure,
			PhiPlusMapping phiPlusMapping, String fstLine, String cstLine, String myLine)
			throws ParseException, CorpusException, IOException {

		assert eSentence != null : "null sentence";
		assert fSentence != null : "null sentence2";
		assert alignments != null : "null alignments";
		assert context != null : "null context";
		assert comment != null : "null comment";

		// check the boundary arrays for null before dereferencing them below
		assert sourceMorphemeBoundaries != null : "null source morpheme boundaries";
		assert targetMorphemeBoundaries != null : "null target morpheme boundaries";
		assert eSentence.length == sourceMorphemeBoundaries.length : "source token-morphemeBoundary mismatch";
		assert fSentence.length == targetMorphemeBoundaries.length : "target token-morphemeBoundary mismatch";

		// create the tokenizer
		this.tok = new Retokenizer();

		// Preliminarily set sentence values. Both sides must be in place
		// before the setters below run, since each setter reads the length of
		// the opposite side.
		this.eSentence = eSentence;
		this.fSentence = fSentence;
		this.eSentenceNormalized = eSentence;
		this.fSentenceNormalized = fSentence;

		this.id = id;

		// the display alignment is built inside setDisplaySourceSentence; the
		// normalized forms are (re)computed by both calls
		setDisplaySourceSentence(eSentence, sourceMorphemeBoundaries, alignments);
		setDisplayTargetSentence(fSentence, targetMorphemeBoundaries, this.alignment.toString());

		this.context = context;
		this.comment = comment;
		this.featureStructure = featureStructure;
		this.sourceConstituentStructure = sourceConstituentStructure;
		this.targetConstituentStructure = targetConstituentStructure;
		this.phiPlusMapping = phiPlusMapping;

		this.fstLine = fstLine;
		this.cstLine = cstLine;
		this.myLine = myLine;

		// sanity check: the source c-structure must have exactly one terminal
		// per normalized source token
		if (this.sourceConstituentStructure != null) {
			ArrayList<TreeNode> terminals = this.sourceConstituentStructure.getTerminalNodes();

			if (terminals.size() != eSentenceNormalized.length) {
				throw new CorpusException(
						"Constituent structure terminals and source sentence have different lengths ("
								+ terminals.size() + " and " + eSentenceNormalized.length + "): "
								+ this.sourceConstituentStructure.toString() + " :: "
								+ StringUtils.untokenize(this.eSentenceNormalized));
			}
		}
	}

	/**
	 * Compares by object identity only: two distinct instances are never
	 * equal, even when they share an ID.
	 */
	@Override
	public boolean equals(Object obj) {
		return (this == obj);
	}

	/**
	 * @return the ID of this pair. With assertions enabled, fails if no ID
	 *         has been assigned (ID is -1).
	 */
	@Override
	public int hashCode() {
		assert id != -1 : "No ID has been assigned.";
		return id;
	}

	/**
	 * Checks whether this pair and another have the same display target
	 * tokens.
	 * 
	 * @param other
	 *            the pair to compare against
	 * @return true if both target sentences have the same length and equal
	 *         tokens at every position
	 */
	public boolean hasEqualTranslation(SentencePair other) {
		if (this.fSentence.length != other.fSentence.length) {
			return false;
		}

		for (int i = 0; i < fSentence.length; i++) {
			if (!this.fSentence[i].equals(other.fSentence[i])) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Flips the source and target sentences, their morpheme boundaries and
	 * their alignments, then recomputes the normalized forms.
	 * <p>
	 * NOTE(review): the constituent structures and the phi-plus mapping are
	 * not flipped here; confirm whether callers expect that.
	 * 
	 * @throws ParseException
	 *             propagated from rebuilding or normalizing the alignment
	 */
	public void flip() throws ParseException {
		String[] tempSentence = eSentence;
		eSentence = fSentence;
		fSentence = tempSentence;

		// The boundary flags belong to their tokens, so they must travel with
		// them; otherwise the arrays no longer match in length whenever the
		// two sides have different token counts.
		boolean[] tempBoundaries = sourceMorphemeBoundaries;
		sourceMorphemeBoundaries = targetMorphemeBoundaries;
		targetMorphemeBoundaries = tempBoundaries;

		StringUtils.internTokens(eSentence);
		this.alignment =
				new Alignment(alignment.transpose().toString(), this.eSentence.length,
						this.fSentence.length);

		normalize();
		assertMorphemeBoundarySanity();
	}

	/**
	 * @return An array of length (getDisplaySourceTokens().length) where
	 *         array entry i indicates that entry i is a morpheme that should
	 *         attach to the PREVIOUS morpheme. That is, when boundaries[i] ==
	 *         true, then sentence[i-1] and sentence[i] were separated by a "+"
	 *         in the input. The internal array is returned, not a copy.
	 */
	public boolean[] getSourceMorphemeBoundaries() {
		return sourceMorphemeBoundaries;
	}

	/**
	 * @return the target-side counterpart of
	 *         {@link #getSourceMorphemeBoundaries()}, one entry per display
	 *         target token. The internal array is returned, not a copy.
	 */
	public boolean[] getTargetMorphemeBoundaries() {
		return targetMorphemeBoundaries;
	}

	/**
	 * @return the display source tokens joined by
	 *         {@link #untokenizeSentence(String[], boolean[])}
	 */
	public String getDisplaySourceSentence() {
		return untokenizeSentence(eSentence, sourceMorphemeBoundaries);
	}

	/**
	 * @return the display source tokens (the internal array, not a copy)
	 */
	public String[] getDisplaySourceTokens() {
		return eSentence;
	}

	/**
	 * @return the normalized source tokens (the internal array, not a copy)
	 */
	public String[] getNormalizedSourceTokens() {
		return eSentenceNormalized;
	}

	/**
	 * Replaces the display source sentence with the tokenization of the given
	 * string (see {@link #tokenizeSentence(String)}) and renormalizes.
	 * <p>
	 * NOTE(review): the string form of the current NORMALIZED alignment is
	 * used to rebuild the display alignment; confirm that this is intended.
	 * 
	 * @param sentence
	 *            the new source sentence, with "+" marking morpheme boundaries
	 * @throws ParseException
	 *             propagated from rebuilding or normalizing the alignment
	 */
	public void setDisplaySourceSentence(String sentence) throws ParseException {
		Pair<String[], boolean[]> pair = tokenizeSentence(sentence);
		setDisplaySourceSentence(pair.first, pair.second, this.normalizedAlignment.toString());
		assertMorphemeBoundarySanity();
	}

	/**
	 * Installs new display source tokens and boundaries, rebuilds the display
	 * alignment from the given string and renormalizes.
	 */
	private void setDisplaySourceSentence(String[] sentence, boolean[] morphemeBoundaries,
			String alignments) throws ParseException {

		eSentence = sentence;
		sourceMorphemeBoundaries = morphemeBoundaries;
		StringUtils.internTokens(eSentence);
		this.alignment = new Alignment(alignments, this.eSentence.length, this.fSentence.length);

		normalize();
	}

	/**
	 * Recomputes the normalized source and target tokens and the normalized
	 * alignment from the current display tokens and display alignment.
	 * <p>
	 * Runs in two passes: first the source side is lowercased and retokenized
	 * against the display alignment, then the target side is lowercased and
	 * retokenized against the alignment produced by the first pass.
	 */
	private void normalize() throws ParseException {

		// resolve circular dependency: the first pass needs a target length
		// before the target side has been normalized
		fSentenceNormalized = fSentence;

		// first, normalize eSentence, based on original alignments
		// tokenize punctuation and lowercase the sentence
		String[] eLowercase = StringUtils.tokenize(StringUtils.untokenize(eSentence).toLowerCase());
		Pair<String[], String> p1 = tok.retokenize(eLowercase, alignment.toString(), false);
		eSentenceNormalized = p1.first;
		normalizedAlignment =
				new Alignment(p1.second, eSentenceNormalized.length, fSentenceNormalized.length);
		StringUtils.internTokens(eSentenceNormalized);

		// now, normalize fSentence, based on newly created alignments
		// tokenize punctuation and lowercase the sentence
		String[] fLowercase = StringUtils.tokenize(StringUtils.untokenize(fSentence).toLowerCase());
		Pair<String[], String> p2 =
				tok.retokenize(fLowercase, normalizedAlignment.toString(), true);
		this.fSentenceNormalized = p2.first;
		normalizedAlignment =
				new Alignment(p2.second, this.eSentenceNormalized.length,
						this.fSentenceNormalized.length);

		StringUtils.internTokens(fSentenceNormalized);
		assertAlignmentSanity();
	}

	/**
	 * Asserts that each morpheme boundary array has one entry per display
	 * token on its side.
	 */
	private void assertMorphemeBoundarySanity() {
		assert sourceMorphemeBoundaries.length == eSentence.length;
		assert targetMorphemeBoundaries.length == fSentence.length;
	}

	/**
	 * When assertions are enabled, checks that every index of the display
	 * alignment lies within the display sentences and that every index of the
	 * normalized alignment lies within the normalized sentences.
	 */
	private void assertAlignmentSanity() {
		if (DebugUtils.isAssertEnabled()) {
			// first, check display alignments for bounds
			RawAlignment[] displayAlignments = alignment.getRawAlignments();
			for (final RawAlignment a : displayAlignments) {
				assertBounds(a.sourceTerminals, eSentence.length, "eSentence");
				assertBounds(a.targetTerminals, fSentence.length, "fSentence");
			}

			// now, check normalized alignments for bounds (these must come
			// from normalizedAlignment, not from the display alignment)
			RawAlignment[] normalizedAlignments = normalizedAlignment.getRawAlignments();
			for (final RawAlignment a : normalizedAlignments) {
				assertBounds(a.sourceTerminals, eSentenceNormalized.length, "eSentenceNormalized");
				assertBounds(a.targetTerminals, fSentenceNormalized.length, "fSentenceNormalized");
			}
		}
	}

	/**
	 * When assertions are enabled, checks that each non-null constituent
	 * structure has exactly one terminal node per normalized token on its
	 * side.
	 */
	public void assertConstituentStructureSanity() {
		if (DebugUtils.isAssertEnabled()) {
			if (sourceConstituentStructure != null) {
				ArrayList<TreeNode> sourceNodes = sourceConstituentStructure.getTerminalNodes();
				assert sourceNodes.size() == eSentenceNormalized.length : "source c-structure to eSentenceNormalized length mismatch";
			}
			if (targetConstituentStructure != null) {
				ArrayList<TreeNode> targetNodes = targetConstituentStructure.getTerminalNodes();
				assert targetNodes.size() == fSentenceNormalized.length : "target c-structure to fSentenceNormalized length mismatch";
			}
		}
	}

	/**
	 * Asserts that every value in the array is a valid 1-based index into a
	 * sequence of the given length.
	 * 
	 * @param array
	 *            the 1-based indices to check
	 * @param lengthBound
	 *            the largest permitted index
	 * @param errorTrace
	 *            name of the sequence, used in the failure message
	 */
	private void assertBounds(int[] array, int lengthBound, String errorTrace) {
		for (final int n : array) {
			assert n >= 1 && n <= lengthBound : "1-based alignment index " + n
					+ " out of bounds for " + errorTrace + " which has length " + lengthBound;
		}
	}

	/**
	 * @return the display target tokens joined by
	 *         {@link #untokenizeSentence(String[], boolean[])}
	 */
	public String getDisplayTargetSentence() {
		return untokenizeSentence(fSentence, targetMorphemeBoundaries);
	}

	/**
	 * @return the display target tokens (the internal array, not a copy)
	 */
	public String[] getDisplayTargetTokens() {
		return fSentence;
	}

	/**
	 * @return the normalized target tokens (the internal array, not a copy)
	 */
	public String[] getNormalizedTargetTokens() {
		return fSentenceNormalized;
	}

	/**
	 * Replaces the display target sentence with the tokenization of the given
	 * string (see {@link #tokenizeSentence(String)}) and renormalizes. The
	 * display alignment object is left as it was.
	 * 
	 * @param sentence
	 *            the new target sentence, with "+" marking morpheme boundaries
	 * @throws ParseException
	 *             propagated from normalizing the alignment
	 */
	public void setDisplayTargetSentence(String sentence) throws ParseException {
		Pair<String[], boolean[]> pair = tokenizeSentence(sentence);
		setDisplayTargetSentence(pair.first, pair.second, this.normalizedAlignment.toString());
		assertMorphemeBoundarySanity();
	}

	/**
	 * Installs new display target tokens and boundaries and renormalizes.
	 * <p>
	 * NOTE(review): the alignments argument is currently unused, so the
	 * display alignment is NOT rebuilt for the new target length (unlike the
	 * source-side setter). Confirm whether that asymmetry is intended.
	 */
	private void setDisplayTargetSentence(String[] sentence, boolean[] morphemeBoundaries,
			String alignments) throws ParseException {

		fSentence = sentence;
		targetMorphemeBoundaries = morphemeBoundaries;
		StringUtils.internTokens(fSentence);
		normalize();
	}

	/**
	 * @return the alignment over the display tokens
	 */
	public Alignment getDisplayAlignment() {
		return alignment;
	}

	/**
	 * @return the alignment over the normalized tokens
	 */
	public Alignment getNormalizedAlignment() {
		return normalizedAlignment;
	}

	/**
	 * Replaces the display alignment and recomputes the normalized alignment.
	 * The stored display alignment is rebuilt from the string form of the
	 * argument rather than kept as the passed-in instance.
	 * 
	 * @param alignment
	 *            the new display alignment
	 * @throws ParseException
	 *             propagated from rebuilding or normalizing the alignment
	 */
	public void setAlignments(Alignment alignment) throws ParseException {
		this.alignment = alignment;

		// re-run the source setter to obtain the normalized alignment
		setDisplaySourceSentence(this.eSentence, this.sourceMorphemeBoundaries,
				alignment.toString());
	}

	public String getContext() {
		return context;
	}

	public void setContext(String context) {
		this.context = context;
	}

	public String getComment() {
		return comment;
	}

	public void setComment(String comment) {
		this.comment = comment;
	}

	public int getId() {
		return id;
	}

	public void setId(int id) {
		this.id = id;
	}

	public SmartTree getFeatureStructure() {
		return featureStructure;
	}

	public void setSourceConstituentStructure(SmartTree tree) {
		this.sourceConstituentStructure = tree;
	}

	public SmartTree getSourceConstituentStructure() {
		return sourceConstituentStructure;
	}

	public void setTargetConstituentStructure(SmartTree tree) {
		this.targetConstituentStructure = tree;
	}

	public SmartTree getTargetConstituentStructure() {
		return targetConstituentStructure;
	}

	public PhiPlusMapping getPhiPlusMapping() {
		return phiPlusMapping;
	}

	public void setPhiPlusMapping(PhiPlusMapping mapping) {
		this.phiPlusMapping = mapping;
	}

	/**
	 * Records the file and line number where the target sentence originated.
	 * 
	 * @param tgtLine
	 *            the file:line string to store
	 */
	public void setTargetSentenceSourceLine(String tgtLine) {
		this.tgtLine = tgtLine;
	}

	/**
	 * Gets the file and line number where the target sentence originated.
	 * 
	 * @return the stored file:line string, or null if
	 *         {@link #setTargetSentenceSourceLine(String)} was never called
	 */
	public String getTargetSentenceSourceLine() {
		return tgtLine;
	}

	/**
	 * Gets the file and line number where the feature structure originated.
	 * 
	 * @return the file:line string given at construction
	 */
	public String getFeatureSourceLine() {
		return fstLine;
	}

	/**
	 * Gets the file and line number where the constituent structure originated.
	 * 
	 * @return the file:line string given at construction
	 */
	public String getConstituentSourceLine() {
		return cstLine;
	}

	/**
	 * Gets the file and line number where this sentence pair originated.
	 * 
	 * @return the file:line string given at construction
	 */
	public String getMyLine() {
		return myLine;
	}

	/**
	 * Attaches the given pair to this one as an alternate and marks it as
	 * such. With assertions enabled, fails if this pair is itself an
	 * alternate.
	 * 
	 * @param pair
	 *            the pair to attach
	 */
	public void addAlternate(SentencePair pair) {
		assert !isAlternate() : "Only non-alternate sentences can have child alternates.";
		alternates.add(pair);
		pair.setAlternate(true);
	}

	/**
	 * @return the alternates attached to this pair (the internal list, not a
	 *         copy)
	 */
	public ArrayList<SentencePair> getAlternates() {
		return alternates;
	}

	public boolean isAlternate() {
		return alternate;
	}

	public void setAlternate(boolean b) {
		this.alternate = b;
	}

	/**
	 * Orders sentence pairs by ascending ID. With assertions enabled, fails
	 * if this pair has no ID assigned (ID is -1).
	 * <p>
	 * Note that this ordering is not consistent with {@link #equals(Object)},
	 * which uses identity.
	 */
	public int compareTo(SentencePair other) {
		assert id != -1 : "No ID has been assigned.";
		// explicit comparison instead of subtraction, which can overflow
		if (this.id < other.id) {
			return -1;
		} else if (this.id > other.id) {
			return 1;
		} else {
			return 0;
		}
	}

	/**
	 * @return the display source and target tokens in the form
	 *         "source / target"
	 */
	@Override
	public String toString() {
		return StringUtils.untokenize(eSentence) + " / " + StringUtils.untokenize(fSentence);
	}

	/**
	 * Splits a sentence into tokens and morpheme boundary flags.
	 * <p>
	 * A space is inserted before every "+" so that each morpheme becomes its
	 * own token; a token that then starts with "+" has that character removed
	 * and its boundary flag set.
	 * 
	 * @param sentence
	 *            the sentence to split, with "+" marking morpheme boundaries
	 * @return the tokens and a parallel array whose entry i is true when
	 *         token i was preceded by "+"
	 */
	public static Pair<String[], boolean[]> tokenizeSentence(String sentence) {
		String preTok = StringUtils.replaceFast(sentence, "+", " +");
		String[] tokens = StringUtils.tokenize(preTok);
		boolean[] morphemeBoundaries = new boolean[tokens.length];
		for (int i = 0; i < tokens.length; i++) {
			if (tokens[i].startsWith("+")) {
				tokens[i] = tokens[i].substring(1);
				morphemeBoundaries[i] = true;
			}
		}
		return new Pair<String[], boolean[]>(tokens, morphemeBoundaries);
	}

	/**
	 * Joins tokens back into a sentence, writing "+" before a token whose
	 * boundary flag is set and a single space before any other token. Nothing
	 * is written before the first token, so its flag is ignored.
	 * 
	 * @param tokens
	 *            the tokens to join
	 * @param morphemeBoundaries
	 *            one flag per token; must have the same length as tokens
	 * @return the joined sentence (empty for zero tokens)
	 */
	public static String untokenizeSentence(String[] tokens, boolean[] morphemeBoundaries) {

		assert tokens.length == morphemeBoundaries.length : "array length mismatch";

		StringBuilder builder = new StringBuilder();
		for (int i = 0; i < tokens.length; i++) {
			if (i > 0) {
				builder.append(morphemeBoundaries[i] ? "+" : " ");
			}
			builder.append(tokens[i]);
		}
		return builder.toString();
	}

	/**
	 * Renders this pair, followed by each of its alternates, as labelled
	 * lines beginning with "newpair". Fields that are null (or an ID of -1)
	 * are omitted.
	 * 
	 * @return the serialized text
	 */
	public String serialize() {

		assertMorphemeBoundarySanity();
		StringBuilder builder = new StringBuilder();

		// NOTE: myLine is not printed since it will change next time the file
		// is read in

		builder.append("newpair\n");
		if (id != -1) {
			builder.append("sentid#: " + id + "\n");
		}
		if (fstLine != null) {
			builder.append("fstline: " + fstLine + "\n");
		}
		if (eSentence != null) {
			builder.append("srcsent: " + untokenizeSentence(eSentence, sourceMorphemeBoundaries)
					+ "\n");
		}
		if (fSentence != null) {
			builder.append("tgtsent: " + untokenizeSentence(fSentence, targetMorphemeBoundaries)
					+ "\n");
		}
		if (alignment != null) {
			builder.append("aligned: " + alignment + "\n");
		}
		if (context != null) {
			builder.append("context: " + context + "\n");
		}
		if (comment != null) {
			builder.append("comment: " + comment + "\n");
		}
		if (featureStructure != null) {
			builder.append("fstruct: " + featureStructure + "\n");
		}
		if (cstLine != null) {
			builder.append("cstline: " + cstLine + "\n");
		}
		if (sourceConstituentStructure != null) {
			builder.append("cstruct: " + sourceConstituentStructure + "\n");
		}
		if (targetConstituentStructure != null) {
			builder.append("cstrtgt: " + targetConstituentStructure + "\n");
		}
		if (phiPlusMapping != null) {
			builder.append("phiplus: " + phiPlusMapping + "\n");
		}
		if (alternate) {
			builder.append("alternate");
		}

		// each alternate is written as its own "newpair" record
		for (final SentencePair child : alternates) {
			builder.append("\n" + child.serialize());
		}

		return builder.toString();
	}
}
