package cmonson.morphologyChallengeUtilities;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
 * Recombines two segmentation sources into one analysis file.
 *
 * <p>Input 1 (read as UTF-8) holds ParaMor segmentations of the roots that
 * Morfessor identified, one root per line:
 * <pre>
 *   aachen	a +achen, aa +chen, aach +en, aache +n
 * </pre>
 *
 * <p>Input 2 (read as ISO-8859-1) holds Morfessor style analyses, one per line,
 * each preceded by a frequency count:
 * <pre>
 *   1579 unido/STM + s/SUF
 * </pre>
 *
 * <p>For each Morfessor analysis, every stem together with its suffixes is
 * looked up among the ParaMor segmentations, and the cross product of all
 * alternatives is written (as ISO-8859-1) to the output file in the form
 * {@code wordform<TAB>analysis, analysis, ...}.
 *
 * <p>Lines in either input that are blank or start with {@code #} are ignored.
 * Badly formatted lines are reported on standard error and skipped.
 */
public class RecombineParaMorAndMorfessorSegmentations {

	/** Exit status used when the program has to give up. */
	private static final int EXIT_FAILURE = 1;

	/** Tag Morfessor attaches to prefixes, including the separating slash. */
	private static final String PREFIX_TAG = "/PRE";

	// The patterns are compiled once instead of once per input line.
	private static final Pattern BLANK_LINE_PATTERN = Pattern.compile("^\\s*$");
	private static final Pattern COMMENT_LINE_PATTERN = Pattern.compile("^\\s*#.*$");
	private static final Pattern PARAMOR_ANALYSIS_PATTERN = Pattern.compile("^(\\S+)\\s+(.+)$");
	private static final Pattern MORFESSOR_LINE_PATTERN = Pattern.compile("^\\s*\\d+ (.*)$");
	private static final Pattern ANNOTATED_MORPHEME_PATTERN = Pattern.compile("^([^/]+)/(.*)$");

	BufferedReader paraMorSegmentationsOfMorfessorRootsReader = null;
	BufferedReader morfessorStyleAnalysesReader = null;
	PrintWriter processedMorfessorWriter = null;

	/** Maps each root to the array of ParaMor analyses read for it. */
	Map<String, String[]> paraMorAnalysesOfRootsByRoot =	new HashMap<String, String[]>();

	/**
	 * Opens the two input files and the output file. If any of them cannot be
	 * opened an explanation is printed and the JVM exits.
	 *
	 * @param paraMorSegmentationsOfMorfessorRoots ParaMor segmentations of the
	 *        roots, read as UTF-8
	 * @param morfessorStyleAnalyses Morfessor style analyses, read as ISO-8859-1
	 * @param recombinedOutputFile file the recombined analyses are written to,
	 *        as ISO-8859-1
	 */
	public
	RecombineParaMorAndMorfessorSegmentations(
			File paraMorSegmentationsOfMorfessorRoots,
			File morfessorStyleAnalyses,
			File recombinedOutputFile) {

		paraMorSegmentationsOfMorfessorRootsReader =
			openFileForReading(paraMorSegmentationsOfMorfessorRoots, "UTF-8");

		morfessorStyleAnalysesReader =
			openFileForReading(morfessorStyleAnalyses, "ISO-8859-1"); //latin 1

		processedMorfessorWriter =
			openFileForWriting(recombinedOutputFile, "ISO-8859-1"); // latin 1
	}

	/**
	 * Opens a file for reading. A file whose name ends in {@code .gz} is
	 * decompressed on the fly. On failure an explanation is printed to standard
	 * error and the JVM exits with a non-zero status.
	 *
	 * @param fileToOpen the file to read
	 * @param encoding name of the character encoding of the file
	 * @return a buffered reader over the (decompressed) file content
	 */
	public BufferedReader openFileForReading(File fileToOpen, String encoding) {

		BufferedReader bufferedReaderToReturn = null;

		try {
			if (fileToOpen.getName().endsWith(".gz")) {

				// Reading gzipped files
				bufferedReaderToReturn =
					new BufferedReader(
							new InputStreamReader(
									new GZIPInputStream (
											new FileInputStream(fileToOpen)),
											encoding));
			} else {
				// Reading un-gzipped files
				bufferedReaderToReturn =
					new BufferedReader(
							new InputStreamReader(
									new FileInputStream(fileToOpen),
								    encoding));
			}
		}
		catch(FileNotFoundException e) {
			System.err.println();
			System.err.println("  Sorry.  The file: " + fileToOpen.getAbsolutePath());
			System.err.println("    could not be read.  Here is the full Java error:");
			System.err.println();
			System.err.println(e.getMessage());
			System.err.println();
			System.err.println("  Did NOT successfully set the corpus path.");
			System.err.println();
			System.exit(EXIT_FAILURE);
		}
		catch(IOException e) {
			System.err.println();
			System.err.println("  Sorry.  While opening the file: " + fileToOpen.getAbsolutePath());
			System.err.println("    an error was encountered.  Here is the full Java error:");
			System.err.println();
			System.err.println(e.getMessage());
			System.err.println();
			System.err.println("  Did NOT successfully set the corpus path.");
			System.err.println();
			System.exit(EXIT_FAILURE);
		}

		return bufferedReaderToReturn;
	}

	/**
	 * Opens a file for writing, replacing any existing content. On failure an
	 * explanation is printed to standard error and the JVM exits with a
	 * non-zero status.
	 *
	 * @param fileToOpen the file to write
	 * @param encoding name of the character encoding to write with
	 * @return an auto-flushing writer on the file
	 */
	private PrintWriter openFileForWriting(File fileToOpen, String encoding) {

		PrintWriter printWriterToReturn = null;

		try {
			printWriterToReturn =
				new PrintWriter(
						new BufferedWriter(
								new OutputStreamWriter(
										new FileOutputStream(fileToOpen),
										encoding)),
						true); // true to autoflush

		} catch (FileNotFoundException e) {
			System.err.println();
			System.err.println("Cannot set the output file:");
			System.err.println("  " + fileToOpen.getAbsolutePath());
			e.printStackTrace();
			System.err.println();
			System.exit(EXIT_FAILURE);

		} catch (IOException e) {
			System.err.println("Failed to open the output file because");
			System.err.println("  of the following internal error:");
			e.printStackTrace();
			System.err.println();
			System.exit(EXIT_FAILURE);
		}

		return printWriterToReturn;
	}

	/**
	 * Closes the two input readers and the output writer. All three are closed
	 * even when closing an earlier one fails.
	 *
	 * @throws IOException if closing one of the readers fails
	 */
	public void close() throws IOException {
		try {
			if (paraMorSegmentationsOfMorfessorRootsReader != null) {
				paraMorSegmentationsOfMorfessorRootsReader.close();
			}
		} finally {
			try {
				if (morfessorStyleAnalysesReader != null) {
					morfessorStyleAnalysesReader.close();
				}
			} finally {
				if (processedMorfessorWriter != null) {
					processedMorfessorWriter.close();
				}
			}
		}
	}


	/**
	 * Command line entry point.
	 *
	 * @param args the command line arguments, which must be exactly:
	 *
	 * {@code <ParaMor-segmented-roots-file> <morfessor-style-analyses-file> <combined-output-file>}
	 * @throws IOException if reading an input or writing the output fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 3) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java RecombineParaMorAndMorfessorSegmentations " + String.format("%n") +
			 "        <path-to-ParaMor-segmented-roots-that-Morfessor-identified> " + String.format("%n") +
			 "        <path-to-file-of-morfessor-style-analyses> " + String.format("%n") +
			 "        <path-to-combined-analysis-output>" + String.format("%n%n"));
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			System.exit(EXIT_FAILURE);
		}

		RecombineParaMorAndMorfessorSegmentations recombiner =
			new RecombineParaMorAndMorfessorSegmentations(
					new File(args[0]),
					new File(args[1]),
					new File(args[2]));

		try {
			recombiner.readParaMorSegmentationsOfMorfessorRoots();
			recombiner.recombine();
		} finally {
			// Release the file handles whether or not processing succeeded.
			recombiner.close();
		}
	}

	/** Returns true for lines that carry no data: blank lines and # comments. */
	private static boolean isBlankOrComment(String line) {
		return BLANK_LINE_PATTERN.matcher(line).matches()
			|| COMMENT_LINE_PATTERN.matcher(line).matches();
	}

	/** Prints a progress message to standard error for every 1000th data line. */
	private static void reportProgress(int lineCounter, String line) {
		if ((lineCounter%1000) == 0) {
			System.err.println("  " + lineCounter + " " + line);
			System.err.flush();
		}
	}

	/**
	 * Reads the ParaMor segmentations into {@link #paraMorAnalysesOfRootsByRoot}.
	 *
	 * Each data line is of the form:
	 *
	 * <pre>
	 * &lt;word-form&gt;\t&lt;space-separated-analysis&gt;, &lt;space-separated-analysis&gt;...
	 *
	 * aachen	a +achen, aa +chen, aach +en, aache +n
	 * aachener	aach +ener, aache +ner, aachen +er
	 * aag	a +ag, aa +g
	 * </pre>
	 *
	 * A line that does not have this form is reported and skipped. If a root
	 * occurs more than once the last occurrence wins.
	 *
	 * @throws IOException if reading the file fails
	 */
	private void readParaMorSegmentationsOfMorfessorRoots() throws IOException {

		int lineCounter = 0;
		String line;
		while ((line = paraMorSegmentationsOfMorfessorRootsReader.readLine()) != null) {

			if (isBlankOrComment(line)) {
				continue;
			}

			lineCounter++;
			reportProgress(lineCounter, line);

			// Surrounding whitespace carries no information; trimming keeps it
			// out of the first root and the last analysis.
			Matcher paraMorAnalysisMatcher = PARAMOR_ANALYSIS_PATTERN.matcher(line.trim());
			if ( ! paraMorAnalysisMatcher.matches()) {
				System.err.println();
				System.err.println("BADLY FORMATTED ParaMor Analysis File!!");
				System.err.println();
				System.err.println("Line: |" + line + "|");
				System.err.println();
				// Asking a failed matcher for its groups would throw, so skip the line.
				continue;
			}
			String rootIdentifiedByMorfessor = paraMorAnalysisMatcher.group(1);
			String paraMorAnalysesOfRoot     = paraMorAnalysisMatcher.group(2);

			paraMorAnalysesOfRootsByRoot.put(
					rootIdentifiedByMorfessor,
					paraMorAnalysesOfRoot.split("\\s*,\\s*"));
		}
	}


	/**
	 * Reads the Morfessor style analyses and writes one recombined line per
	 * analysis to the output file.
	 *
	 * Each input line holds a frequency count of the type and an analysis of
	 * the type:
	 *
	 * <pre>
	 * 1581 esta/STM
	 * 1579 unido/STM + s/SUF
	 * 1543 esta/STM + do/SUF
	 * </pre>
	 *
	 * The frequency count is thrown away. A prefix contributes the single
	 * alternative {@code +<prefix>/PRE}; a root (stem plus its suffixes)
	 * contributes every ParaMor analysis stored for it. The output line lists
	 * the cross product of these alternatives, separated by {@code ", "}.
	 *
	 * A root that ParaMor did not analyze is reported and then used
	 * unsegmented, as its own single analysis. Badly formatted lines are
	 * reported and skipped.
	 *
	 * @throws IOException if reading the input or writing the output fails
	 */
	public void recombine() throws IOException {

		int lineCounter = 0;
		String line;
		while ((line = morfessorStyleAnalysesReader.readLine()) != null) {

			if (isBlankOrComment(line)) {
				continue;
			}

			lineCounter++;
			reportProgress(lineCounter, line);

			// Throw away the frequency count
			Matcher citationFormMatcher = MORFESSOR_LINE_PATTERN.matcher(line);
			if ( ! citationFormMatcher.matches()) {
				System.err.println();
				System.err.println("BADLY FORMATTED Morfessor Analysis File!!");
				System.err.println();
				System.err.println("Line: " + line);
				System.err.println();
				// Asking a failed matcher for its groups would throw, so skip the line.
				continue;
			}
			// Trailing whitespace would otherwise become part of the last tag.
			String morfessorAnalysis = citationFormMatcher.group(1).trim();

			List<String> prefixesAndRoots =	getPrefixesAndRoots(morfessorAnalysis);
			if (prefixesAndRoots.isEmpty()) {
				// Already reported by getPrefixesAndRoots; there is nothing to write.
				continue;
			}

			StringBuilder wordform = new StringBuilder();
			Set<String> combinedAnalyses = new HashSet<String>();
			for (String prefixOrRoot : prefixesAndRoots) {

				// The alternatives this prefix or root adds to every analysis so far
				String[] alternatives;

				// A Prefix (roots never contain a slash, so the tag identifies it)
				if (prefixOrRoot.endsWith(PREFIX_TAG)) {
					wordform.append(
							prefixOrRoot.substring(
									0, prefixOrRoot.length() - PREFIX_TAG.length()));
					alternatives = new String[] { "+" + prefixOrRoot };

				// A Root
				} else {
					wordform.append(prefixOrRoot);
					alternatives = paraMorAnalysesOfRootsByRoot.get(prefixOrRoot);
					if (alternatives == null) {
						System.err.println();
						System.err.println("ERROR: Morfessor found a root that was not");
						System.err.println("  analyzed by ParaMor. Something is odd.");
						System.err.println("  The root is: " + prefixOrRoot);
						// Keep the word in the output by leaving the root unsegmented.
						alternatives = new String[] { prefixOrRoot };
					}
				}

				Set<String> newCombinedAnalyses = new HashSet<String>();
				for (String alternative : alternatives) {

					// There are no analyses yet
					if (combinedAnalyses.isEmpty()) {
						newCombinedAnalyses.add(alternative);

					// Continue expanding each combined analysis
					} else {
						for (String combinedAnalysis : combinedAnalyses) {
							newCombinedAnalyses.add(combinedAnalysis + " " + alternative);
						}
					}
				}
				combinedAnalyses = newCombinedAnalyses;
			}

			// Write out combined analysis
			processedMorfessorWriter.print(wordform + "\t");
			boolean first = true;
			for (String combinedAnalysis : combinedAnalyses) {
				if (first) {
					first = false;
				} else {
					processedMorfessorWriter.print(", ");
				}
				processedMorfessorWriter.print(combinedAnalysis);
			}
			processedMorfessorWriter.println();
		}

		// PrintWriter never throws on write errors; it only records them.
		if (processedMorfessorWriter.checkError()) {
			throw new IOException("An error occurred while writing the recombined analyses");
		}
	}

	/**
	 * Splits a Morfessor analysis into prefixes and roots, in order.
	 *
	 * A morfessorAnalysis looks like: 'unido/STM + s/SUF'. Each prefix is
	 * returned with its tag still attached (e.g. 're/PRE'). Each stem is
	 * concatenated with the suffixes that directly follow it and returned
	 * without any tag, i.e. unido/STM + s/SUF --> unidos
	 *
	 * Morphemes that cannot be interpreted (no tag, an unknown tag, or a suffix
	 * that does not follow a stem) are reported on standard error and left out.
	 *
	 * @param morfessorAnalysis the analysis, without the frequency count
	 * @return the prefixes and roots found, possibly empty, never null
	 */
	private List<String> getPrefixesAndRoots(String morfessorAnalysis) {
		List<String> prefixesAndRoots = new ArrayList<String>();

		String[] annotatedMorphemes = morfessorAnalysis.split("\\s+\\+\\s+");

		// The root being accumulated; null while we are not inside a root.
		StringBuilder thisStemAndSuffixes = null;
		for (String annotatedMorpheme : annotatedMorphemes) {

			Matcher annotatedMorphemeMatcher =
				ANNOTATED_MORPHEME_PATTERN.matcher(annotatedMorpheme);
			if ( ! annotatedMorphemeMatcher.matches()) {
				// No '<morpheme>/<tag>' shape; asking for the groups would throw.
				System.err.println("BADLY FORMED MORFESSOR ANALYSIS(0): " + morfessorAnalysis);
				continue;
			}
			String morpheme = annotatedMorphemeMatcher.group(1);
			String morphemeType = annotatedMorphemeMatcher.group(2);

			if (morphemeType.equals("PRE")) {
				if (thisStemAndSuffixes != null) {
					prefixesAndRoots.add(thisStemAndSuffixes.toString());  // add the accumulated root
					thisStemAndSuffixes = null;
				}
				prefixesAndRoots.add(annotatedMorpheme);                   // add this new prefix

			} else if (morphemeType.equals("STM")) {
				if (thisStemAndSuffixes != null) {
					prefixesAndRoots.add(thisStemAndSuffixes.toString());  // add the accumulated root
				}
				thisStemAndSuffixes = new StringBuilder(morpheme);         // start a new root

			} else if (morphemeType.equals("SUF")) {
				if (thisStemAndSuffixes != null) {
					thisStemAndSuffixes.append(morpheme);                  // continue the current root
				} else {
					// A suffix with no stem to attach to
					System.err.println("BADLY FORMED MORFESSOR ANALYSIS(7): " + morfessorAnalysis);
				}

			} else if (thisStemAndSuffixes != null) {
				// Unknown tag while inside a root
				System.err.println("BADLY FORMED MORFESSOR ANALYSIS(4): " + morfessorAnalysis);

			} else {
				// Unknown tag while outside a root
				System.err.println("BADLY FORMED MORFESSOR ANALYSIS(8): " + morfessorAnalysis);
			}
		}

		if (thisStemAndSuffixes != null) {
			prefixesAndRoots.add(thisStemAndSuffixes.toString());          // add the accumulated root
		} else {
			// The analysis did not end in a stem or suffix
			System.err.println("BADLY FORMED MORFESSOR ANALYSIS(9): " + morfessorAnalysis);
		}

		if (prefixesAndRoots.size() == 0) {
			System.err.println("WE FOUND NO STEMS in this Morfessor Analysis. NOT GOOD: " + morfessorAnalysis);
		}

		return prefixesAndRoots;
	}

}
