package edu.cmu.cs.lti.avenue.navigation.tools;

import info.jonclark.properties.SmartProperties;
import info.jonclark.util.FileUtils;
import info.jonclark.util.StringUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Pattern;

import edu.cmu.cs.lti.avenue.corpus.Corpus;
import edu.cmu.cs.lti.avenue.corpus.SentencePair;
import edu.cmu.cs.lti.avenue.corpus.Serializer;

/**
 * Command-line tool that attaches a context string to every sentence pair of
 * a translated corpus and writes the result to a new corpus file.
 * <p>
 * Contexts are read from a separate context file and indexed by a normalized
 * form of the source sentence (lower-cased, passed through
 * {@code CFStructureAligner.replaceNamesIgnoreGender}, and stripped of the
 * characters {@code . ? !}). When the same key occurs several times in the
 * corpus, the contexts recorded for that key are handed out in file order.
 * <p>
 * All input and output locations are taken from the properties file given as
 * the single command-line argument.
 */
public class ContextAligner {

	/** A sentence line of the context file: digits, a colon, then the sentence. */
	private static final Pattern SENTENCE_LINE = Pattern.compile("[0-9]+:.*");

	/** Prefix of a line in the context file that carries a context string. */
	private static final String CONTEXT_PREFIX = "context:";

	/** Characters removed from a sentence when building its lookup key. */
	private static final String PUNCTUATION = ".?!";

	/**
	 * The contexts recorded for one sentence key, in insertion order, together
	 * with a cursor that hands them out one at a time.
	 */
	private static class ContextList {
		private final ArrayList<String> list = new ArrayList<String>();

		/** Index of the context that the next call to {@link #next()} returns. */
		private int next = 0;

		/** Appends a context to the end of the list. */
		public void add(String context) {
			list.add(context);
		}

		/** @return true while at least one context has not been handed out yet */
		public boolean hasNext() {
			return next < list.size();
		}

		/**
		 * @return the next context that has not been handed out; callers must
		 *         check {@link #hasNext()} first
		 */
		public String next() {
			return list.get(next++);
		}

		/** @return the first context added; does not move the cursor */
		public String first() {
			return list.get(0);
		}

		/** @return the total number of contexts added, consumed or not */
		public int size() {
			return list.size();
		}
	}

	/**
	 * Loads the translated corpus, sets a context on every sentence pair whose
	 * key is found in the context file, and saves the corpus to the output
	 * file. Sentences without a match are reported on standard output and left
	 * without a context; sentences whose context list is already used up fall
	 * back to the first context of that list.
	 * 
	 * @param args
	 *            exactly one element: the path of the properties file
	 * @throws Exception
	 *             if the properties, names, context or corpus files cannot be
	 *             read, or the output cannot be written
	 */
	public static void main(String[] args) throws Exception {
		if (args.length != 1) {
			System.err.println("Usage: program <properties_file>");
			System.exit(1);
		}

		SmartProperties props = new SmartProperties(args[0]);

		String encoding = props.getPropertyString("global.encoding");
		File inFile = props.getPropertyFile("paths.translatedCorpus");
		File contextFile = props.getPropertyFile("paths.finalDeliverableContextCorpus");
		File maleFile = props.getPropertyFile("paths.maleNamesFile");
		File femaleFile = props.getPropertyFile("paths.femaleNamesFile");
		File outFile = props.getPropertyFile("paths.translatedCorpusWithContext");

		// Name lists are lower-cased because sentences are lower-cased before
		// the names in them are replaced.
		String[] maleNames =
				StringUtils.tokenize(FileUtils.getFileAsString(maleFile).toLowerCase(), "\n");
		String[] femaleNames =
				StringUtils.tokenize(FileUtils.getFileAsString(femaleFile).toLowerCase(), "\n");

		HashMap<String, ContextList> contexts = loadContexts(contextFile, maleNames, femaleNames);

		int nUnmatched = 0;
		int nExhausted = 0;
		Corpus corpus = Serializer.loadSentencePairs(inFile, encoding);
		for (final SentencePair pair : corpus.getSentences()) {

			String srcSent = pair.getDisplaySourceSentence();
			srcSent = srcSent.toLowerCase();
			srcSent = CFStructureAligner.replaceNamesIgnoreGender(srcSent, maleNames, femaleNames);

			// NOTE(review): loadContexts() additionally runs its keys through
			// StringUtils.untokenize(StringUtils.tokenize(...)); this side does
			// not. Presumably the display sentence is already in that form --
			// confirm, otherwise whitespace differences cause missed matches.
			String srcSentKey = StringUtils.removeCharsInCharList(srcSent, PUNCTUATION);

			ContextList contextList = contexts.get(srcSentKey);
			if (contextList == null) {
				nUnmatched++;
				System.out.println("No context found for sentence: " + srcSent);
			} else if (contextList.hasNext()) {
				pair.setContext(contextList.next());
			} else {
				// More corpus sentences share this key than the context file
				// has entries for it; reuse the first entry rather than leave
				// the pair without any context.
				nExhausted++;
				System.out.println("Context list exhausted for sentence: " + srcSent
						+ " (size: " + contextList.size() + ")");
				String topContext = contextList.first();
				pair.setContext(topContext);
				System.out.println("WARNING: Backing off to top context: " + topContext);
			}
		}

		System.out.println(nExhausted + " sentences exhausted the context list.");
		System.out.println(nUnmatched + " sentences could not be matched to their context.");
		Serializer.saveSentencePairs(corpus, outFile);
		System.out.println("WROTE MERGED TRANSLATED CORPUS WITH CONTEXT ADDED: "
				+ outFile.getAbsolutePath());
	}

	/**
	 * Reads the context file and groups its context strings by normalized
	 * source sentence.
	 * <p>
	 * Two kinds of lines are recognized; all others are ignored:
	 * <ul>
	 * <li>a line of the form {@code <digits>:<sentence>} sets the current
	 * sentence key;</li>
	 * <li>a line starting with {@code context:} adds the rest of that line
	 * (trimmed) to the list of the current sentence key.</li>
	 * </ul>
	 * A {@code context:} line that appears before any sentence line is stored
	 * under the empty-string key.
	 * 
	 * @param contextFile
	 *            the file to read
	 * @param maleNames
	 *            lower-cased male names, passed to
	 *            {@code CFStructureAligner.replaceNamesIgnoreGender}
	 * @param femaleNames
	 *            lower-cased female names, passed to
	 *            {@code CFStructureAligner.replaceNamesIgnoreGender}
	 * @return a map from normalized sentence key to its contexts in file order
	 * @throws FileNotFoundException
	 *             if {@code contextFile} cannot be opened
	 * @throws IOException
	 *             if reading or closing the file fails
	 */
	private static HashMap<String, ContextList> loadContexts(File contextFile, String[] maleNames,
			String[] femaleNames) throws FileNotFoundException, IOException {

		HashMap<String, ContextList> contexts = new HashMap<String, ContextList>();

		// NOTE(review): FileReader decodes with the platform default charset,
		// while the corpus itself is loaded with the "global.encoding"
		// property -- confirm the context file does not need that encoding.
		BufferedReader contextIn = new BufferedReader(new FileReader(contextFile));
		try {
			String line;
			String srcSent = "";
			while ((line = contextIn.readLine()) != null) {

				if (SENTENCE_LINE.matcher(line).matches()) {
					srcSent = StringUtils.substringAfter(line, ":").trim();
					srcSent = srcSent.toLowerCase();
					srcSent = CFStructureAligner.replaceNamesIgnoreGender(srcSent, maleNames,
							femaleNames);
					srcSent = StringUtils.untokenize(StringUtils.tokenize(srcSent));

					// remove punctuation
					srcSent = StringUtils.removeCharsInCharList(srcSent, PUNCTUATION);

				} else if (line.startsWith(CONTEXT_PREFIX)) {
					String context = StringUtils.substringAfter(line, CONTEXT_PREFIX).trim();
					ContextList list = contexts.get(srcSent);
					if (list == null) {
						list = new ContextList();
						contexts.put(srcSent, list);
					}
					list.add(context);
				}
			}
		} finally {
			// Release the file handle even when reading fails part-way.
			contextIn.close();
		}

		return contexts;
	}
}
