/**
 * The AVENUE Project
 * Language Technologies Institute
 * School of Computer Science
 * (c) 2007 Carnegie Mellon University
 * 
 * Corpus Navigator
 * Written by Jonathan Clark
 */
package edu.cmu.cs.lti.avenue.navigation.tools;

import info.jonclark.lang.Pair;
import info.jonclark.log.LogUtils;
import info.jonclark.properties.SmartProperties;
import info.jonclark.util.FileUtils;
import info.jonclark.util.StringUtils;
import it.unimi.dsi.fastutil.objects.Object2ObjectArrayMap;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.logging.Logger;

import edu.cmu.cs.lti.avenue.corpus.Retokenizer;
import edu.cmu.cs.lti.avenue.featurespecification.FeatureStructureException;
import edu.cmu.cs.lti.avenue.featurespecification.FeatureStructureManager;
import edu.cmu.cs.lti.avenue.trees.smart.SmartTree;
import edu.cmu.cs.lti.avenue.trees.smart.TreeNode;
import edu.cmu.cs.lti.avenue.trees.smart.SmartTree.LabelMode;

/**
 * Given a C structure file containing the desired sentences and a F structure
 * file containing a superset of the desired sentences, produce a file with the
 * C structure following each F structure.
 */
public class CFStructureAligner {

	private static final Logger log = LogUtils.getLogger();

	private static String[] maleNames;
	private static String[] femaleNames;
	private static final String F_STRUCT_EXT = ".txt";

	/** Every character stripped by {@link #removePunctuation(String)}. */
	private static final String PUNCTUATION = "`~!@#$%^&*(){}[]:\";'<>,./?\\|-=+";

	/** Result of trying to pair one f-structure record with a c-structure. */
	private enum Outcome {
		/** A c-structure was found, the f-structure validated, and the pair was written. */
		ALIGNED,
		/** No c-structure is stored under the record's sentence key. */
		NO_MATCH,
		/** A c-structure was found but the f-structure failed to parse or validate. */
		INVALID
	}

	/** Running totals that are reported at the end of the run. */
	private static final class Tally {
		int featureStructures;
		int aligned;
		int unalignedFS;

		/** Adds one record outcome to the totals. */
		void record(Outcome outcome) {
			if (outcome == Outcome.ALIGNED) {
				aligned++;
			} else if (outcome == Outcome.NO_MATCH) {
				unalignedFS++;
			}
			// INVALID records are reported when they are detected and are
			// counted in neither bucket (same as the original behavior).
		}
	}

	/**
	 * Removes every character listed in {@link #PUNCTUATION} from the given
	 * string.
	 * 
	 * @param s the string to strip
	 * @return the string after each punctuation character has been replaced by
	 *         the empty string via <code>StringUtils.replaceFast</code>
	 */
	public static String removePunctuation(String s) {
		for (char c : PUNCTUATION.toCharArray()) {
			s = StringUtils.replaceFast(s, String.valueOf(c), "");
		}
		return s;
	}

	/**
	 * Replaces each given name with a gender-specific placeholder. Female names
	 * are substituted first (with <code>ANY_FEMALE_NAME</code>), then male
	 * names (with <code>ANY_MALE_NAME</code>).
	 * 
	 * @param sentence the sentence to rewrite
	 * @param maleNames names to replace with <code>ANY_MALE_NAME</code>
	 * @param femaleNames names to replace with <code>ANY_FEMALE_NAME</code>
	 * @return the rewritten sentence
	 */
	public static String replaceNames(String sentence, String[] maleNames, String[] femaleNames) {
		sentence = substituteNames(sentence, femaleNames, "ANY_FEMALE_NAME");
		sentence = substituteNames(sentence, maleNames, "ANY_MALE_NAME");
		return sentence;
	}

	/**
	 * Replaces each given name, male or female, with the single placeholder
	 * <code>ANY_NAME</code>. Female names are substituted first, then male
	 * names.
	 * 
	 * @param sentence the sentence to rewrite
	 * @param maleNames male names to replace
	 * @param femaleNames female names to replace
	 * @return the rewritten sentence
	 */
	public static String replaceNamesIgnoreGender(String sentence, String[] maleNames,
			String[] femaleNames) {
		sentence = substituteNames(sentence, femaleNames, "ANY_NAME");
		sentence = substituteNames(sentence, maleNames, "ANY_NAME");
		return sentence;
	}

	/** Replaces each of the names, in array order, with the placeholder. */
	private static String substituteNames(String sentence, String[] names, String placeholder) {
		for (String name : names) {
			sentence = StringUtils.replaceFast(sentence, name, placeholder);
		}
		return sentence;
	}

	/** Rewrites the two-character quote tokens `` and '' as a plain double quote. */
	private static String replaceQuotes(String sentence) {
		sentence = StringUtils.replaceFast(sentence, "``", "\"");
		sentence = StringUtils.replaceFast(sentence, "''", "\"");

		return sentence;
	}

	/**
	 * Reads the c-structure file (one tree per line) and indexes every line by
	 * a normalized sentence key: the terminals' lexical items are joined,
	 * lower-cased, names are replaced by <code>ANY_NAME</code> and the quote
	 * tokens are normalized. Punctuation removal and meaning-fix undoing are
	 * intentionally not applied. If two lines produce the same key, the later
	 * line replaces the earlier one.
	 * 
	 * @param cStructFile file to read
	 * @param cStructs receives key -> original c-structure line
	 * @param cStructsLines receives key -> "fileName:lineNumber"
	 * @param cStructsUnused receives every key; entries are removed later as
	 *            they are aligned
	 * @throws Exception if the file cannot be read or a line cannot be parsed
	 */
	private static void loadCStructs(File cStructFile, HashMap<String, String> cStructs,
			HashMap<String, String> cStructsLines, HashSet<String> cStructsUnused)
			throws Exception {

		BufferedReader cIn = new BufferedReader(new FileReader(cStructFile));
		try {
			int cStructFileLine = 0;
			String cLine;
			while ((cLine = cIn.readLine()) != null) {
				cStructFileLine++;

				SmartTree tree =
						SmartTree.parse(cLine.trim(), SmartTree.SOURCE_C_STRUCT_LABEL,
								LabelMode.LABEL_ALL_NODES);
				ArrayList<TreeNode> terminals = tree.getTerminalNodes();

				ArrayList<String> tokens = new ArrayList<String>(terminals.size());
				for (final TreeNode terminal : terminals) {
					String lexicalItem = terminal.getValues().get(1);
					tokens.add(lexicalItem);
				}

				// we can ignore gender since we substitute names from the
				// f-structs
				String key = StringUtils.untokenize(tokens);
				key = key.toLowerCase();
				key = replaceNamesIgnoreGender(key, maleNames, femaleNames);
				key = replaceQuotes(key);

				cStructs.put(key, cLine);
				cStructsLines.put(key, cStructFile.getName() + ":"
						+ StringUtils.forceNumberLength(String.valueOf(cStructFileLine), 4));
				cStructsUnused.add(key);
			}
		} finally {
			// close even when parsing fails part-way through the file
			cIn.close();
		}
	}

	/**
	 * Tries to pair one complete f-structure record with its c-structure and,
	 * on success, writes the record followed by <code>cstline:</code> and
	 * <code>cstruct:</code> lines. The f-structure is parsed and validated
	 * before anything is written.
	 * 
	 * @param srcSentKey normalized source sentence of the record (may be null
	 *            if the record had no <code>srcsent:</code> line)
	 * @param strFStruct accumulated text of the record
	 * @param location "fileName:lineNumber" used in error messages
	 * @return what happened to the record
	 * @throws Exception if parsing or validation fails with something other
	 *             than the two reported exception types
	 */
	private static Outcome emitIfAligned(String srcSentKey, String strFStruct, String location,
			FeatureStructureManager fsMan, HashMap<String, String> cStructs,
			HashMap<String, String> cStructsLines, HashSet<String> cStructsUnused,
			PrintWriter out) throws Exception {

		String cStruct = cStructs.get(srcSentKey);
		if (cStruct == null) {
			System.out.println("WARNING: No matching c-struct found for f-struct sentence: "
					+ srcSentKey);
			return Outcome.NO_MATCH;
		}

		try {
			// validate the feature structure before moving on
			String structOnly = StringUtils.substringAfter(strFStruct, "fstruct: ");
			SmartTree structTree =
					SmartTree.parse(structOnly, SmartTree.F_STRUCT_LABEL,
							LabelMode.LABEL_ODD_NODES);
			fsMan.validateFeatureStructure(structTree);
		} catch (FeatureStructureException e) {
			System.out.println("ERROR: Error in feature structure BEFORE line: " + location
					+ "\n" + e.getMessage());
			return Outcome.INVALID;
		} catch (ParseException e) {
			System.out.println("ERROR: Error in parsing feature structure BEFORE line: "
					+ location + "\n" + e.getMessage());
			return Outcome.INVALID;
		}

		cStructsUnused.remove(srcSentKey);
		out.print(strFStruct);

		String cStructLine = cStructsLines.get(srcSentKey);
		assert cStructLine != null;
		out.println("cstline: " + cStructLine);
		out.println("cstruct: " + cStruct + "\n\n");
		return Outcome.ALIGNED;
	}

	/**
	 * Reads one f-structure file record by record (records start at a
	 * <code>newpair</code> line) and aligns each record, including the last
	 * one, through {@link #emitIfAligned}. Everything between
	 * <code>*** START MULTIPLY ***</code> and
	 * <code>*** END MULTIPLY ***</code>, markers included, is skipped.
	 * 
	 * @throws Exception if the file cannot be read or a record cannot be
	 *             processed
	 */
	private static void alignFile(File fStructFile, Retokenizer retokenizer,
			FeatureStructureManager fsMan, HashMap<String, String> cStructs,
			HashMap<String, String> cStructsLines, HashSet<String> cStructsUnused,
			PrintWriter out, Tally tally) throws Exception {

		BufferedReader fIn = new BufferedReader(new FileReader(fStructFile));
		try {
			int fStructFileLine = 0;
			String fLine;
			StringBuilder fStruct = new StringBuilder();
			String srcSentKey = null;
			boolean firstTime = true;
			boolean inMultiply = false;

			while ((fLine = fIn.readLine()) != null) {
				fStructFileLine++;

				if (fLine.startsWith("*** START MULTIPLY ***")) {
					inMultiply = true;
				} else if (fLine.startsWith("*** END MULTIPLY ***")) {
					inMultiply = false;
					// the closing marker is not record content; previously it
					// leaked into the text of the current record
					continue;
				}
				if (inMultiply) {
					continue;
				}

				if (fLine.startsWith("newpair")) {
					tally.featureStructures++;

					if (firstTime) {
						// nothing has been accumulated yet, so there is
						// nothing to emit
						firstTime = false;
					} else {
						assert srcSentKey != null : "Error processing fStruct: "
								+ fStruct.toString();
						tally.record(emitIfAligned(srcSentKey, fStruct.toString(),
								fStructFile.getName() + ":" + fStructFileLine, fsMan, cStructs,
								cStructsLines, cStructsUnused, out));
					}

					// start the next record; this also runs for the very first
					// record so that it gets the same header as all the others
					fStruct = new StringBuilder();
					srcSentKey = null;
					fStruct.append("newpair\n" + "fstline: " + fStructFile.getName() + ":"
							+ fStructFileLine + "\n");
					continue;
				}

				if (fLine.startsWith("(")) {
					fStruct.append("fstruct: " + fLine + "\n");
				} else if (!fLine.equals("")) {
					fStruct.append(fLine + "\n");
				}

				if (fLine.startsWith("srcsent:")) {
					// we can ignore gender since we substitute names from the
					// f-structs
					String value = StringUtils.substringAfter(fLine, "srcsent: ").trim();
					value = value.toLowerCase();
					value = replaceNamesIgnoreGender(value, maleNames, femaleNames);

					Pair<String[], String> p =
							retokenizer.retokenize(StringUtils.tokenize(value), "", false);
					srcSentKey = StringUtils.untokenize(p.first);
				}
			}

			// The last record has no following "newpair" line to trigger it.
			// Send it through the same validation and reporting as every other
			// record so that the summary counts stay consistent.
			if (!firstTime) {
				tally.record(emitIfAligned(srcSentKey, fStruct.toString(),
						fStructFile.getName() + ":" + fStructFileLine, fsMan, cStructs,
						cStructsLines, cStructsUnused, out));
			}
		} finally {
			fIn.close();
		}
	}

	/**
	 * Entry point. Expects exactly one argument, the path of a properties file
	 * that supplies these keys: <code>paths.charniakParses</code>,
	 * <code>paths.finalDeliverableOrderedCorpus</code>,
	 * <code>paths.maleNamesFile</code>, <code>paths.femaleNamesFile</code>,
	 * <code>paths.meaningFixes</code>, <code>paths.featureSpec</code> and
	 * <code>paths.cfUntranslatedCorpus</code> (the output file).
	 * 
	 * @param args command line arguments
	 * @throws Exception if any input cannot be read or parsed
	 */
	public static void main(String[] args) throws Exception {

		if (args.length != 1) {
			System.err.println("Usage: program <properties_file>");
			System.exit(1);
		}

		SmartProperties props = new SmartProperties(args[0]);

		File cStructFile = props.getPropertyFile("paths.charniakParses");
		File finalDeliverableOrderedCorpus =
				props.getPropertyFile("paths.finalDeliverableOrderedCorpus");
		File maleNamesFile = props.getPropertyFile("paths.maleNamesFile");
		File femaleNamesFile = props.getPropertyFile("paths.femaleNamesFile");
		File meaningFixesFile = props.getPropertyFile("paths.meaningFixes");
		File xmlFeatureSpec = props.getPropertyFile("paths.featureSpec");
		File outputFile = props.getPropertyFile("paths.cfUntranslatedCorpus");

		// The undo-meaning-fixes step is currently disabled, but the fixes
		// file is still loaded exactly as before so the run touches the same
		// inputs.
		Object2ObjectArrayMap<String, String> meaningFixes =
				ElicitedCorpusAligner.loadMeaningFixes(meaningFixesFile);

		FeatureStructureManager fsMan = new FeatureStructureManager(xmlFeatureSpec);

		HashMap<String, String> cStructs = new HashMap<String, String>();
		HashMap<String, String> cStructsLines = new HashMap<String, String>();
		HashSet<String> cStructsUnused = new HashSet<String>();

		// first read in male and female names (lower-cased, because the
		// sentence keys are lower-cased before names are replaced)
		maleNames =
				StringUtils.tokenize(FileUtils.getFileAsString(maleNamesFile).toLowerCase(), "\n");
		femaleNames =
				StringUtils.tokenize(FileUtils.getFileAsString(femaleNamesFile).toLowerCase(), "\n");

		// next, read in the c structs, replacing names
		loadCStructs(cStructFile, cStructs, cStructsLines, cStructsUnused);

		// now try to match the c structures and f structures
		Retokenizer retokenizer = new Retokenizer();
		Tally tally = new Tally();
		boolean writeFailed;

		PrintWriter out = new PrintWriter(outputFile);
		try {
			File[] fStructFiles = new File[] { finalDeliverableOrderedCorpus };
			System.out.println("Loaded " + fStructFiles.length + " feature structure files.");
			for (File fStructFile : fStructFiles) {
				alignFile(fStructFile, retokenizer, fsMan, cStructs, cStructsLines,
						cStructsUnused, out, tally);
			}
			// PrintWriter never throws on write errors; ask it explicitly
			writeFailed = out.checkError();
		} finally {
			out.close();
		}

		// print out the unused c-structs in the order of the file and line on
		// which they occurred.
		System.out.println("\n\n\n\n\n\nUNUSED C STRUCTS: ");
		ArrayList<String> unusedList = new ArrayList<String>(cStructsUnused.size());
		for (String cStruct : cStructsUnused) {
			unusedList.add(cStructsLines.get(cStruct) + ": " + cStruct);
		}
		Collections.sort(unusedList);
		for (String cStruct : unusedList) {
			System.out.println(cStruct);
		}

		System.out.println(cStructsUnused.size() + " unused c structs remaining.");
		System.out.println(tally.unalignedFS + " unused f structs remaining.");
		System.out.println(tally.featureStructures + " feature structures read.");
		System.out.println(tally.aligned + " successfully aligned.");

		if (writeFailed) {
			System.err.println("ERROR: Failed while writing aligned CF structure file: "
					+ outputFile.getAbsolutePath());
		} else {
			System.out.println("WROTE ALIGNED CF STRUCTURE FILE: "
					+ outputFile.getAbsolutePath());
		}
	}
}
