package edu.cmu.cs.lti.letras.tools;

import info.jonclark.util.FileUtils;
import info.jonclark.util.StringUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;

/**
 * Given a C structure file containing the desired sentences and a F structure
 * file containing a superset of the desired sentences, produce a file with the
 * C structure following each F structure.
 */
public class CFStructureAligner {

	// Name lists loaded once in main() and read by replaceNames().
	private static String[] maleNames;
	private static String[] femaleNames;

	// Extension of the files in the F structure directory that are scanned.
	private static final String F_STRUCT_EXT = ".txt";

	/**
	 * Decides whether a token of a C structure line is part of the surface
	 * sentence (as opposed to bracketing or an all-uppercase label).
	 * 
	 * @param word
	 *            a single token
	 * @return true if the word starts with a digit, OR is a single uppercase
	 *         letter other than S, OR is a single quote character, OR
	 *         contains at least one lowercase character; false otherwise
	 *         (including for the empty string)
	 */
	private static boolean isKeeper(String word) {

		if (word.length() > 0 && Character.isDigit(word.charAt(0))) {
			return true;
		}

		if (word.length() == 1) {
			char c = word.charAt(0);
			if (Character.isUpperCase(c) || c == '\'') {
				// a lone "S" is rejected; every other single uppercase
				// letter and the lone quote are kept
				return c != 'S';
			}
		}

		for (int i = 0; i < word.length(); i++) {
			if (Character.isLowerCase(word.charAt(i))) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Replaces every occurrence of a known female name with ANY_FEMALE_NAME
	 * and then every occurrence of a known male name with ANY_MALE_NAME.
	 * Female names are substituted first, matching is case-sensitive, so this
	 * must be called before the sentence is lowercased.
	 * 
	 * @param sentence
	 *            the sentence to rewrite
	 * @return the sentence with names replaced by placeholders
	 */
	private static String replaceNames(String sentence) {
		for (String femaleName : femaleNames)
			sentence = StringUtils.replaceFast(sentence, femaleName, "ANY_FEMALE_NAME");
		for (String maleName : maleNames)
			sentence = StringUtils.replaceFast(sentence, maleName, "ANY_MALE_NAME");

		return sentence;
	}

	/**
	 * Re-attaches contraction and possessive fragments that appear as
	 * separate tokens ("will n't" -> "won't", " n't" -> "n't", " 's" -> "'s",
	 * "s ' " -> "s' "). The "will n't" rule runs first so that it is not
	 * pre-empted by the more general " n't" rule.
	 * 
	 * @param sentence
	 *            a lowercased, space-separated sentence
	 * @return the sentence with the fragments joined
	 */
	private static String replaceContractions(String sentence) {
		sentence = StringUtils.replaceFast(sentence, "will n't", "won't");
		sentence = StringUtils.replaceFast(sentence, " n't", "n't");
		sentence = StringUtils.replaceFast(sentence, " 's", "'s");
		sentence = StringUtils.replaceFast(sentence, "s ' ", "s' ");
		return sentence;
	}

	/**
	 * Derives the lookup key for one line of the C structure file: the
	 * parentheses are spaced out, only the tokens accepted by
	 * {@link #isKeeper(String)} are retained, names are replaced by
	 * placeholders, and the result is lowercased and has its contractions
	 * re-attached.
	 * 
	 * @param cLine
	 *            one raw line of the C structure file
	 * @return the normalized sentence used as the map key
	 */
	private static String buildCStructKey(String cLine) {
		String spaced = cLine;
		spaced = StringUtils.replaceFast(spaced, "(", " ( ");
		spaced = StringUtils.replaceFast(spaced, ")", " ) ");

		String[] sourceTokens = StringUtils.tokenize(spaced);
		ArrayList<String> remainingTokens = new ArrayList<String>(sourceTokens.length);
		for (String sourceToken : sourceTokens) {
			if (isKeeper(sourceToken)) {
				remainingTokens.add(sourceToken);
			}
		}

		String key = StringUtils.untokenize(remainingTokens);
		// names must be replaced before lowercasing (matching is case-sensitive)
		key = replaceNames(key);
		key = key.toLowerCase();
		key = replaceContractions(key);
		return key;
	}

	/**
	 * Derives the lookup key from a "srcsent:" line of an F structure file:
	 * the text after "srcsent: " is trimmed, names are replaced by
	 * placeholders, the result is lowercased and the punctuation characters
	 * . ? ! " , are removed.
	 * 
	 * @param fLine
	 *            a line starting with "srcsent:"
	 * @return the normalized sentence to look up among the C structure keys
	 */
	private static String buildSrcSentKey(String fLine) {
		String value = StringUtils.substringAfter(fLine, "srcsent: ").trim();
		value = replaceNames(value);
		value = value.toLowerCase();

		// remove punctuation
		value = StringUtils.replaceFast(value, new String[] { ".", "?", "!", "\"", "," },
				new String[] { "", "", "", "", "" });
		return value;
	}

	/**
	 * Writes the accumulated F structure followed by its C structure if the
	 * given key has a matching C structure, and marks that C structure as
	 * used. Nothing is written when the key is null or unmatched.
	 * 
	 * @param out
	 *            destination for the aligned pair
	 * @param fStruct
	 *            the text accumulated for the current F structure entry
	 * @param srcSentKey
	 *            normalized source sentence of the entry; may be null
	 * @param cStructs
	 *            key -> raw C structure line
	 * @param cStructsLines
	 *            key -> "file:line" label of the C structure
	 * @param cStructsUnused
	 *            keys not matched so far; the key is removed on a match
	 * @return true if a pair was written
	 */
	private static boolean writeIfAligned(PrintWriter out, StringBuilder fStruct,
			String srcSentKey, HashMap<String, String> cStructs,
			HashMap<String, String> cStructsLines, HashSet<String> cStructsUnused) {

		if (srcSentKey == null) {
			return false;
		}

		String cStruct = cStructs.get(srcSentKey);
		if (cStruct == null) {
			return false;
		}

		cStructsUnused.remove(srcSentKey);
		out.print(fStruct.toString());

		String cStructLine = cStructsLines.get(srcSentKey);
		assert cStructLine != null;
		out.println("cstline: " + cStructLine);
		out.println("cstruct: " + cStruct + "\n\n");
		return true;
	}

	/**
	 * Scans one F structure file and writes every entry whose source sentence
	 * matches a C structure key. Entries are delimited by lines that equal
	 * "newpair"; lines between "*** START MULTIPLY ***" and
	 * "*** END MULTIPLY ***" are skipped.
	 * 
	 * @param fStructFile
	 *            the F structure file to scan
	 * @param out
	 *            destination for aligned pairs
	 * @param cStructs
	 *            key -> raw C structure line
	 * @param cStructsLines
	 *            key -> "file:line" label of the C structure
	 * @param cStructsUnused
	 *            keys not matched so far; updated in place
	 * @return the number of pairs written for this file
	 * @throws Exception
	 *             if the file cannot be opened or read
	 */
	private static int alignFile(File fStructFile, PrintWriter out,
			HashMap<String, String> cStructs, HashMap<String, String> cStructsLines,
			HashSet<String> cStructsUnused) throws Exception {

		int nAligned = 0;

		BufferedReader fIn = new BufferedReader(new FileReader(fStructFile));
		try {
			int fStructFileLine = 0;

			String fLine;
			StringBuilder fStruct = new StringBuilder();
			String srcSentKey = null;
			boolean firstTime = true;
			boolean inMultiply = false;

			while ((fLine = fIn.readLine()) != null) {
				fStructFileLine++;

				if (fLine.startsWith("*** START MULTIPLY ***"))
					inMultiply = true;
				else if (fLine.startsWith("*** END MULTIPLY ***"))
					inMultiply = false;

				// NOTE(review): the END MULTIPLY marker line itself is not
				// skipped -- it falls through and is copied into the current
				// fStruct. Confirm whether that is intended.
				if (inMultiply) {
					continue;
				}

				if (fLine.equals("newpair")) {

					assert firstTime || srcSentKey != null : "For fStruct: "
							+ fStruct.toString();
					firstTime = false;

					// flush the entry that just ended
					if (writeIfAligned(out, fStruct, srcSentKey, cStructs, cStructsLines,
							cStructsUnused)) {
						nAligned++;
					}

					// reset for next iteration
					fStruct = new StringBuilder();
					srcSentKey = null;
					fStruct.append("newpair\n" + "srcline: " + fStructFile.getName() + ":"
							+ fStructFileLine + "\n");
					continue;
				}

				if (fLine.equals("")) {
					// blank lines are dropped from the output
				} else if (fLine.startsWith("(")) {
					fStruct.append("fstruct: " + fLine + "\n");
				} else {
					fStruct.append(fLine + "\n");
				}

				if (fLine.startsWith("srcsent:")) {
					srcSentKey = buildSrcSentKey(fLine);
				}
			}

			// try to match the final one (no trailing "newpair" follows it)
			if (writeIfAligned(out, fStruct, srcSentKey, cStructs, cStructsLines, cStructsUnused)) {
				nAligned++;
			}
		} finally {
			// close even if reading or processing failed
			fIn.close();
		}

		return nAligned;
	}

	/**
	 * Entry point. Reads the name lists and the C structure file, then scans
	 * every F structure file in the given directory and writes each matched
	 * F structure followed by its C structure to the output file. Finally
	 * prints the unmatched C structures, sorted by their "file:line" label,
	 * and the alignment counts to standard output.
	 * 
	 * @param args
	 *            c_struct_file, f_struct_directory, male_names_file,
	 *            female_names_file, output_file
	 * @throws Exception
	 *             if any input cannot be read or the output cannot be written
	 */
	public static void main(String[] args) throws Exception {

		if (args.length != 5) {
			System.err.println("Usage: program <c_struct_file> <f_struct_directory> <male_names_file> <female_names_file> <output_file>");
			System.exit(1);
		}

		File cStructFile = new File(args[0]);
		File fStructDir = new File(args[1]);
		File maleNamesFile = new File(args[2]);
		File femaleNamesFile = new File(args[3]);
		File outputFile = new File(args[4]);

		HashMap<String, String> cStructs = new HashMap<String, String>();
		HashMap<String, String> cStructsLines = new HashMap<String, String>();
		HashSet<String> cStructsUnused = new HashSet<String>();

		// first read in male and female names
		maleNames = StringUtils.tokenize(FileUtils.getFileAsString(maleNamesFile), "\n");
		femaleNames = StringUtils.tokenize(FileUtils.getFileAsString(femaleNamesFile), "\n");

		// next, read in the c structs, replacing names
		BufferedReader cIn = new BufferedReader(new FileReader(cStructFile));
		try {
			int cStructFileLine = 0;
			String cLine;
			while ((cLine = cIn.readLine()) != null) {
				cStructFileLine++;

				String key = buildCStructKey(cLine);

				// if two lines normalize to the same key, the later line
				// replaces the earlier one in both maps
				cStructs.put(key, cLine);
				cStructsLines.put(key, cStructFile.getName() + ":"
						+ StringUtils.forceNumberLength(cStructFileLine + "", 4));
				cStructsUnused.add(key);
			}
		} finally {
			cIn.close();
		}

		// now try to match the c structures and f structures
		int nAligned = 0;
		PrintWriter out = new PrintWriter(outputFile);
		try {
			File[] fStructFiles = FileUtils.getFilesWithExt(fStructDir, F_STRUCT_EXT);
			for (File fStructFile : fStructFiles) {
				nAligned += alignFile(fStructFile, out, cStructs, cStructsLines, cStructsUnused);
			}

			// PrintWriter never throws on write failure; checkError() flushes
			// and reports whether any write went wrong.
			if (out.checkError()) {
				throw new java.io.IOException("Error writing to output file: " + outputFile);
			}
		} finally {
			out.close();
		}

		// print out the unused c-structs in the order of the file and line on
		// which they occurred.
		System.out.println("\n\n\n\n\n\n\n\n\n\nUNUSED C STRUCTS: ");
		ArrayList<String> unusedList = new ArrayList<String>(cStructsUnused.size());
		for (String cStruct : cStructsUnused)
			unusedList.add(cStructsLines.get(cStruct) + ": " + cStruct);
		Collections.sort(unusedList);
		for (String cStruct : unusedList)
			System.out.println(cStruct);

		System.out.println(cStructsUnused.size() + " unused c structs remaining.");
		System.out.println(nAligned + " successfully aligned.");
	}
}
