package cmonson.morphologyChallengeUtilities;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cmonson.morphologyInduction.languages.English;
import cmonson.morphologyInduction.languages.German;
import cmonson.morphologyInduction.languages.Language;
import cmonson.util.FileUtils;

/**
 * Morpho Challenge 2007 provides a certain list of words that each need a segmentation.
 * This list of words includes a lot of words containing what ParaMor considers 
 * punctuation, most notably including '-'.
 * This 'script' takes in an official Morpho Challenge list of words and a ParaMor
 * segmentation that was built from those words, and, although each word in the
 * official list may not have a ParaMor segmentation, each punctuation separated
 * piece of each word in the official list should have a ParaMor segmentation. This
 * 'script' sews these punctuation separated word pieces back into a single analysis. 
 * 
 * @author cmonson
 *
 */

public class ParaMorFinalizer {

	/** Matches a line that is empty or consists only of whitespace. */
	private static final Pattern BLANK_LINE_PATTERN = Pattern.compile("^\\s*$");

	/** Matches a line whose first non-whitespace character is '#'. */
	private static final Pattern COMMENT_LINE_PATTERN = Pattern.compile("^\\s*#.*$");

	/**
	 * Matches one line of the official wordlist: a run of digits, whitespace, and
	 * then the word itself (captured as group 1). Anything after the word is ignored.
	 */
	private static final Pattern MORPHO_CHALLENGE_WORDLIST_PATTERN = 
		Pattern.compile("^\\s*\\d+\\s+(\\S+).*$");

	/**
	 * Matches one line of the ParaMor analyses file: the analyzed string (group 1),
	 * whitespace, and then the remainder of the line (group 2), which is stored as
	 * the analysis of that string.
	 */
	private static final Pattern PARAMOR_ANALYSIS_PATTERN = 
		Pattern.compile("^\\s*(\\S+)\\s+(.*)$");

	/** A progress message is written to stderr each time this many items are handled. */
	private static final int PROGRESS_REPORT_INTERVAL = 10000;

	Language<?> language;
	
	BufferedReader officialMorphoChallenge2007WordlistReader = null;
	BufferedReader paraMorAnalysesReader = null;
	
	PrintWriter officialFinalAnswersOutputWriter = null;
	
	// The official words, in file order (duplicates are kept)
	List<String> officialMorphoChallenge2007Words = new ArrayList<String>();
	// Maps each string that ParaMor analyzed to the analysis text ParaMor gave it
	Map<String, String> paraMorAnalysesByAnalyzedString = new HashMap<String, String>();
	// Maps each official word to its combined, final analysis
	Map<String, String> finalizedParaMorAnalysesByOfficialWord = new HashMap<String, String>();
	
	/**
	 * Opens the two input files and the output file (all as ISO-8859-1) and reads 
	 * both input files completely into memory. The input readers are closed before
	 * this constructor returns. The output writer stays open until the finalized
	 * segmentations are written out, unless construction fails, in which case it
	 * is closed here.
	 * <p>
	 * If <code>languageAsString</code> is not recognized, an error is printed to 
	 * stderr and the JVM exits with a non-zero status.
	 * 
	 * @param languageAsString either "-english" or "-german" (case insensitive)
	 * @param officialMorphoChallenge2007WordlistFile the official list of words to segment
	 * @param paraMorStyleAnalysesFile the file of ParaMor analyses
	 * @param officialFinalAnswersOutputFile the file the final analyses are written to
	 * @throws IOException if reading either input file fails
	 */
	public 
	ParaMorFinalizer(
			String languageAsString,
			File officialMorphoChallenge2007WordlistFile,
			File paraMorStyleAnalysesFile, 
			File officialFinalAnswersOutputFile) throws IOException {
		
		// Locale.ROOT keeps the comparison below independent of the default locale
		// (e.g. the Turkish dotless-i mapping would break "-ENGLISH").
		languageAsString = languageAsString.toLowerCase(Locale.ROOT);
		
		if (languageAsString.equals("-english")) {
			language = new English();
		} else if (languageAsString.equals("-german")) {
			language = new German();
		} else {
			System.err.println();
			System.err.println("ERROR: The first parameter must be <-english|-german>");
			System.err.println("  But was: |" + languageAsString + "| instead.");
			System.err.println();
			System.err.println("Exiting...");
			System.exit(1);  // non-zero: this is an error exit
		}
		
		boolean fullyInitialized = false;
		try {
			officialMorphoChallenge2007WordlistReader =
				FileUtils.openFileForReading(
						officialMorphoChallenge2007WordlistFile, 
						"ISO-8859-1");  // latin 1
			
			paraMorAnalysesReader = 
				FileUtils.openFileForReading(paraMorStyleAnalysesFile, "ISO-8859-1");
			
			officialFinalAnswersOutputWriter =
				FileUtils.openFileForWriting(
						officialFinalAnswersOutputFile, 
						"ISO-8859-1");   // latin 1
			
			readOfficialMorphoChallenge2007Wordlist();
			readParaMorAnalyses();
			
			fullyInitialized = true;
			
		} finally {
			// Both inputs have been read in their entirety (or reading failed), so
			// the readers are no longer needed either way.
			closeQuietly(officialMorphoChallenge2007WordlistReader);
			closeQuietly(paraMorAnalysesReader);
			
			// No one can write the answers if construction failed, so do not
			// leave the output file open.
			if ( ! fullyInitialized && officialFinalAnswersOutputWriter != null) {
				officialFinalAnswersOutputWriter.close();
			}
		}
	}


	/**
	 * Reads the official wordlist and the ParaMor analyses, combines the analyses
	 * of the pieces of each official word, and writes the result to the output file.
	 * 
	 * @param args the command line arguments which must look like:
	 *
	 * &lt;-english|-german&gt; 
	 * &lt;official-Morpho-Challenge-2007-wordlist-file&gt;
	 * &lt;ParaMor-analyses-file&gt; 
	 * &lt;official-final-analyses-output-file&gt;
	 * @throws IOException if reading an input file fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 4) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java ParaMorFinalizer " + String.format("%n") +
			 "        <-english|-german>" + String.format("%n") +
			 "        <path-to-official-list-of-words-to-segment-for-Morpho-Challenge-2007>" + String.format("%n") +
			 "        <path-to-file-of-ParaMor-analyses> " + String.format("%n") +
			 "        <path-to-output-file-containing-official-final-analyses>" + String.format("%n%n"));
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			System.exit(1);  // non-zero: a bad command line is an error
		}
		
		ParaMorFinalizer paraMorFinalizer = 
			new ParaMorFinalizer(
					args[0],
					new File(args[1]), 
					new File(args[2]),
					new File(args[3]));
		
		paraMorFinalizer.finalizeParaMorSegmentations();
		paraMorFinalizer.writeOutFinalizedParaMorSegmentations();
	}
	

	/**
	 * Reads every word of the official wordlist into 
	 * <code>officialMorphoChallenge2007Words</code>, preserving file order.
	 * Blank lines and '#' comment lines are skipped. A line that does not have the
	 * expected format is reported to stderr and skipped.
	 * 
	 * @throws IOException if reading from the wordlist fails
	 */
	private void readOfficialMorphoChallenge2007Wordlist() throws IOException {
		int lineCounter = 0;
		String lineFromOfficialMorphoChallenge2007Wordlist;
		while ((lineFromOfficialMorphoChallenge2007Wordlist = 
					officialMorphoChallenge2007WordlistReader.readLine()) != null) {
			
			if (isBlankOrComment(lineFromOfficialMorphoChallenge2007Wordlist)) {
				continue;
			}
			
			lineCounter++;
			reportReadProgress(lineCounter, lineFromOfficialMorphoChallenge2007Wordlist);
			
			Matcher morphoChallengeWordlistMatcher = 
				MORPHO_CHALLENGE_WORDLIST_PATTERN.matcher(
						lineFromOfficialMorphoChallenge2007Wordlist);
			if ( ! morphoChallengeWordlistMatcher.matches()) {
				reportBadlyFormattedLine(
						"BADLY FORMATTED Official Morpho Challenge Wordlist File!!",
						lineFromOfficialMorphoChallenge2007Wordlist);
				// group() would throw on a failed match, so skip this line
				continue;
			}

			officialMorphoChallenge2007Words.add(morphoChallengeWordlistMatcher.group(1));
		}
	}

	/**
	 * Reads every line of the ParaMor analyses file into 
	 * <code>paraMorAnalysesByAnalyzedString</code>. Blank lines and '#' comment
	 * lines are skipped. A line that does not have the expected format is reported
	 * to stderr and skipped. If a string occurs on more than one line, the last
	 * line wins.
	 * 
	 * @throws IOException if reading from the analyses file fails
	 */
	private void readParaMorAnalyses() throws IOException {
		int lineCounter = 0;
		String lineFromParaMorAnalyses;
		while ((lineFromParaMorAnalyses = paraMorAnalysesReader.readLine()) != null) {
			
			if (isBlankOrComment(lineFromParaMorAnalyses)) {
				continue;
			}
			
			lineCounter++;
			reportReadProgress(lineCounter, lineFromParaMorAnalyses);
			
			Matcher paraMorAnalysisMatcher = 
				PARAMOR_ANALYSIS_PATTERN.matcher(lineFromParaMorAnalyses);
			if ( ! paraMorAnalysisMatcher.matches()) {
				reportBadlyFormattedLine(
						"BADLY FORMATTED ParaMor Analysis File!!",
						lineFromParaMorAnalyses);
				// group() would throw on a failed match, so skip this line
				continue;
			}
			
			String stringAnalyzedByParaMor = paraMorAnalysisMatcher.group(1);
			String paraMorsAnalysesOfString = paraMorAnalysisMatcher.group(2);

			paraMorAnalysesByAnalyzedString.put(
					stringAnalyzedByParaMor, 
					paraMorsAnalysesOfString);
		}
	}


	/**
	 * Builds the final analysis of each official word. Each official word is 
	 * tokenized by the selected language, the ParaMor analysis of each resulting
	 * piece is looked up, and the analyses found are joined with ", ". A piece
	 * that ParaMor did not analyze is reported to stderr and left out. If nothing
	 * at all was found for a word, the word itself becomes its analysis.
	 * 
	 * @throws IOException declared for the benefit of existing callers
	 */
	public void finalizeParaMorSegmentations() throws IOException {
		
		int countOfOfficialWords = 0;
		for (String officialMorphoChallengeWord : officialMorphoChallenge2007Words) {
			countOfOfficialWords++;
			if ((countOfOfficialWords % PROGRESS_REPORT_INTERVAL) == 0) {
				System.err.println(countOfOfficialWords + " official words finalized");
			}
			
			// NOTE(review): the meaning of the boolean flag is defined by 
			// Language.tokenize(), which is not visible from this file.
			List<String> puncSeparatedWordPieces = 
				language.tokenize(officialMorphoChallengeWord, true);
			 
			boolean firstWordPiece = true;
			StringBuilder combinedAnalysesOfPuncSeparatedWordPieces = new StringBuilder();
			for (String puncSeparatedWordPiece : puncSeparatedWordPieces) {
				
				if ( ! paraMorAnalysesByAnalyzedString.containsKey(puncSeparatedWordPiece)) {
					System.err.println(
							"ERROR: The Official word: " + officialMorphoChallengeWord);
					System.err.println(
							"  contains the punctuation separated word piece: " + 
							puncSeparatedWordPiece);
					System.err.println(
							"  But this word piece was NOT analyzed by ParaMor");
					
					continue;
				}
				
				if (firstWordPiece) {
					firstWordPiece = false;
				} else {
					combinedAnalysesOfPuncSeparatedWordPieces.append(", ");
				}
				
				combinedAnalysesOfPuncSeparatedWordPieces.append(
						paraMorAnalysesByAnalyzedString.get(puncSeparatedWordPiece));
			}
			
			// If no analysis text was collected for this 'official Morpho Challenge 
			// 2007 word' (e.g. it consists entirely of characters that are not part
			// of the language), then output the segmentation that consists of just
			// the 'official word' itself.
			String finalAnalysis;
			if (combinedAnalysesOfPuncSeparatedWordPieces.length() == 0) {
				finalAnalysis = officialMorphoChallengeWord;
			} else {
				finalAnalysis = combinedAnalysesOfPuncSeparatedWordPieces.toString();
			}
			
			finalizedParaMorAnalysesByOfficialWord.put(
					officialMorphoChallengeWord, 
					finalAnalysis);
		}
	}

	/**
	 * Writes one line per official word, in wordlist order, of the form
	 * <code>word TAB analysis</code>, and then closes the output writer so that
	 * everything buffered reaches the file. A write failure is reported to stderr.
	 */
	private void writeOutFinalizedParaMorSegmentations() {
		try {
			for (String officialMorphoChallengeWord : officialMorphoChallenge2007Words) {
				
				String paraMorFinalizedAnswer = 
					finalizedParaMorAnalysesByOfficialWord.get(
							officialMorphoChallengeWord);
				
				officialFinalAnswersOutputWriter.println(
						officialMorphoChallengeWord + "\t" + paraMorFinalizedAnswer);
			}
			
			// PrintWriter never throws on failure; checkError() flushes and
			// reports whether anything went wrong.
			if (officialFinalAnswersOutputWriter.checkError()) {
				System.err.println();
				System.err.println(
						"ERROR: A problem occurred while writing the final analyses.");
				System.err.println("  The output file may be incomplete.");
				System.err.println();
			}
		} finally {
			officialFinalAnswersOutputWriter.close();
		}
	}

	/** Returns true if the line is blank or is a '#' comment line. */
	private static boolean isBlankOrComment(String line) {
		return BLANK_LINE_PATTERN.matcher(line).matches()
			|| COMMENT_LINE_PATTERN.matcher(line).matches();
	}

	/** Writes a progress line to stderr once every PROGRESS_REPORT_INTERVAL lines. */
	private static void reportReadProgress(int lineCounter, String line) {
		if ((lineCounter % PROGRESS_REPORT_INTERVAL) == 0) {
			System.err.println("  " + lineCounter + " " + line);
			System.err.flush();
		}
	}

	/** Writes a warning about an unparseable input line to stderr. */
	private static void reportBadlyFormattedLine(String headline, String line) {
		System.err.println();
		System.err.println(headline);
		System.err.println();
		System.err.println("Line: " + line);
		System.err.println();
	}

	/** Closes the reader, if there is one, ignoring any failure to close. */
	private static void closeQuietly(BufferedReader reader) {
		if (reader == null) {
			return;
		}
		try {
			reader.close();
		} catch (IOException ignored) {
			// The reader was only read from; a failure to close loses no data.
		}
	}
}
