package cmonson.morphologyChallengeUtilities;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cmonson.morphologyInduction.languages.Language;
import cmonson.util.FileUtils;

/**
 * This program takes two files containing Morpho Challenge 2007 style segmentations
 * of the exact same wordforms, and produces a single file containing Morpho
 * Challenge 2007 style segmentations for each word from the original 2 files.
 * For each word, the analyses from the second file are simply appended, as
 * additional comma separated analyses, after the analyses for that word from
 * the first file.
 * 
 * @author cmonson
 *
 */

public class MorphoChallengeAnalysisCombiner {

	/** Character encoding used for every file this class reads or writes. */
	private static final String FILE_ENCODING = "ISO-8859-1";   // latin 1

	/** How many items are processed between progress messages on stderr. */
	private static final int PROGRESS_INTERVAL = 10000;

	/** Set to true to echo every parsed segmentation line to stderr. */
	private static final boolean DEBUG = false;

	// Compiled once: these patterns are applied to every line of both input files.
	private static final Pattern BLANK_LINE_PATTERN    = Pattern.compile("^\\s*$");
	private static final Pattern COMMENT_LINE_PATTERN  = Pattern.compile("^\\s*#.*$");
	private static final Pattern SEGMENTATIONS_PATTERN = Pattern.compile("^\\s*(\\S+)\\s+(.*)$");

	Language<?> language;
	
	PrintWriter combinedAnalysesOutputWriter = null;
	
	Map<String, String> morfessorSegmentationsByAnalyzedString = new TreeMap<String, String>();
	Map<String, String> paraMorSegmentationsByAnalyzedString = new TreeMap<String, String>();	
	
	Map<String, String> combinedAnalysesByAnalyzedString = new TreeMap<String, String>();
	
	/**
	 * Reads both segmentation files into memory and opens the output file for writing.
	 * 
	 * @param morfessorSegmentationsFile the first (Morfessor) segmentations file
	 * @param paraMorSegmentationsFile the second (ParaMor) segmentations file
	 * @param combinedAnalysesOutputFile the file the combined analyses are written to
	 * @throws IOException if reading an input file or opening the output file fails
	 */
	public 
	MorphoChallengeAnalysisCombiner(
			File morfessorSegmentationsFile,
			File paraMorSegmentationsFile, 
			File combinedAnalysesOutputFile) throws IOException {
		
		morfessorSegmentationsByAnalyzedString = readSegmentationsFile(morfessorSegmentationsFile);
		paraMorSegmentationsByAnalyzedString = readSegmentationsFile(paraMorSegmentationsFile);
		
		combinedAnalysesOutputWriter =
			FileUtils.openFileForWriting(
					combinedAnalysesOutputFile, 
					FILE_ENCODING);
	}


	/**
	 * @param args the command line arguments which must look like:
	 *
	 * &lt;morfessor-segmentations-file&gt; &lt;paramor-segmentations-file&gt; &lt;combined-output-file&gt;
	 * @throws IOException if an input file cannot be read or the output file cannot be opened
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 3) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java MorphoChallengeAnalysisCombiner " + String.format("%n") +
			 "        <path-to-a-file-of-Morpho-Challenge-2007-style-Morfessor-segmentations>" + String.format("%n") +
			 "        <path-to-a-file-of-ParaMor-segmentations>" + String.format("%n") +
			 "        <path-to-output-file-containing-combined-analyses>" + String.format("%n%n"));
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			System.exit(0);
		}
		
		MorphoChallengeAnalysisCombiner analysisCombiner = 
			new MorphoChallengeAnalysisCombiner(
					new File(args[0]), 
					new File(args[1]),
					new File(args[2]));
		
		analysisCombiner.combineAnalyses();
		analysisCombiner.writeOutCombinedAnalyses();
	}
	
	/**
	 * Reads one Morpho Challenge 2007 style segmentations file. Each non-blank,
	 * non-comment line is expected to hold an analyzed string, then whitespace,
	 * then the (possibly empty) segmentations of that string.
	 * 
	 * Lines that do not have this shape are reported on stderr and skipped.
	 * If the same analyzed string occurs more than once, the last line wins.
	 * 
	 * @param segmentationsFile the file to read
	 * @return a sorted map from each analyzed string to the text of its segmentations
	 * @throws IOException if the file cannot be opened or read
	 */
	private Map<String, String> readSegmentationsFile(File segmentationsFile) throws IOException {
		
		Map<String, String> segmentationsByAnalyzedString = 
			new TreeMap<String, String>();
		
		BufferedReader segmentationsReader =
			FileUtils.openFileForReading(
					segmentationsFile, 
					FILE_ENCODING);

		try {
			int lineCounter = 0;
			String lineFromSegmentationsFile;
			while ((lineFromSegmentationsFile = segmentationsReader.readLine()) != null) {
				
				// skip blank lines
				if (BLANK_LINE_PATTERN.matcher(lineFromSegmentationsFile).matches()) {
					continue;
				}
				// skip comments
				if (COMMENT_LINE_PATTERN.matcher(lineFromSegmentationsFile).matches()) {
					continue;
				}
				
				lineCounter++;
				if ((lineCounter % PROGRESS_INTERVAL) == 0) {
					System.err.println("  " + lineCounter + " " + lineFromSegmentationsFile);
					System.err.flush();
				}
				
				Matcher segmentationsMatcher = 
					SEGMENTATIONS_PATTERN.matcher(lineFromSegmentationsFile);
				if ( ! segmentationsMatcher.matches()) {
					System.err.println();
					System.err.println("BADLY FORMATTED Morpho Challenge 2007 segmentations File!!");
					System.err.println();
					System.err.println("Line: " + lineFromSegmentationsFile);
					System.err.println();
					// Calling group() on a failed match throws IllegalStateException,
					// so the malformed line is skipped once it has been reported.
					continue;
				}
				String analyzedString = segmentationsMatcher.group(1);
				String segmentationsOfString = segmentationsMatcher.group(2);

				// This was an experiment to see if just including stems from Morfessor and
				// suffixes from ParaMor helped. But it turns out that including everything
				// is the best way to go--including everything significantly helps recall, so
				// each method is finding very different correct things: Morfessor is finding
				// the best stems while ParaMor the best suffixes, but the stems ParaMor finds
				// are often stems Morfessor misses while, the affixes Morfessor finds are often
				// affixes ParaMor misses.
				//segmentationsOfString = getBestMorphemesFromThisAnalyzer(segmentationsOfString);
				
				
				// What happens if each morpheme is its own 'analysis'.
				//
				// Precision and F1 go up less than 1% absolute. Separating each 
				// morpheme into its own analysis makes precision be weighted 
				// equally for each word--and words with many analyzes/morphemes 
				// often have low precision.
				//
				// But discussing this with Alon, we decided that putting each morpheme
				// into its own analysis breaks the spirit of the competition. The
				// Morpho Challenge organizers intended separate analyses to be proposed
				// when an algorithm genuinely thought a word form was morphologically
				// ambiguous, not just to allow rampant guessing to increase recall
				// (at the expense of precision.)
				//
				// If my algorithm performs well, we don't want people to look at our
				// segmentations and think: 'well, this isn't even a morphological
				// analyzer, its just guessing unrelated morphemes.'
				//
				// Another way to look at this is that the real reason boosting recall
				// helps so much is that recall starts off so low, in the 30%s. Once
				// morphological analyzers get good enough to be in the 70%s with
				// recall then silly tricks like this won't help so much.
				//
				//segmentationsOfString = segmentationsOfString.replaceAll(" ", ", ");
				//segmentationsOfString = segmentationsOfString.replaceAll(",,", ",");
				
				if (DEBUG) {
					System.err.println(lineFromSegmentationsFile + " --> " + segmentationsOfString);
				}
				
				segmentationsByAnalyzedString.put(
						analyzedString, 
						segmentationsOfString);
			}
		} finally {
			// Release the file handle whether or not reading succeeded.
			segmentationsReader.close();
		}
		
		return segmentationsByAnalyzedString;
	}


	/**
	 * Filters a string of segmentations down to the morphemes that are either
	 * Morfessor stems (ending in "/STM") or affixes that begin with "+" and do
	 * not carry a Morfessor "/SUF" or "/PRE" tag. Kept morphemes are space
	 * separated, and a comma is emitted wherever an input analysis ended.
	 * 
	 * Currently unused: the call in readSegmentationsFile is commented out.
	 * 
	 * @param segmentations whitespace separated morphemes, where the last morpheme
	 *        of each analysis except the final one ends in a comma
	 * @return the retained morphemes, with no trailing comma
	 */
	@SuppressWarnings("unused")
	private String getBestMorphemesFromThisAnalyzer(String segmentations) {
		String bestMorphemesAsSegmentationAnalyses = "";
		
		String[] morphemes = segmentations.split("\\s+");
		for (String morpheme : morphemes) {
			boolean endOfAnAnalysis = false;
			if (morpheme.matches("^.*,$")) {
				endOfAnAnalysis = true;
				// Strings are immutable: the stripped morpheme must be assigned back,
				// otherwise the trailing comma defeats the tag tests below.
				morpheme = morpheme.replaceAll(",$", "");
			}
			
			boolean includeMorpheme = false;
			
			// include Morfessor Stems
			if (morpheme.matches("^.*/STM")) {
				includeMorpheme = true;
			}
			// include non-Morfessor Affixes
			if (morpheme.matches("^\\+.*") && ( ! morpheme.matches("^.*/((SUF)|(PRE))"))) {
				includeMorpheme = true;
			}
			
			if (includeMorpheme) {
				// Space separate morphemes
				if ((bestMorphemesAsSegmentationAnalyses.length() != 0) &&
					bestMorphemesAsSegmentationAnalyses.matches("^.*\\S")) {
					bestMorphemesAsSegmentationAnalyses += " ";
				}
				bestMorphemesAsSegmentationAnalyses += morpheme;
			}
			
			// Close the analysis with a comma, but never emit a leading or doubled comma
			if (endOfAnAnalysis && 
				(   bestMorphemesAsSegmentationAnalyses.length() != 0) &&
				( ! bestMorphemesAsSegmentationAnalyses.matches("^.*,$"))) {
				bestMorphemesAsSegmentationAnalyses += ",";
			}
		}
		
		// Remove any spurious final commas
		bestMorphemesAsSegmentationAnalyses =
			bestMorphemesAsSegmentationAnalyses.replaceAll(",\\s*$", "");
		
		return bestMorphemesAsSegmentationAnalyses;
	}


	/**
	 * Builds the combined analyses: for every analyzed string in the Morfessor
	 * map, the ParaMor analyses of that string (if any) are appended after the
	 * Morfessor analyses, separated by ", ".
	 * 
	 * If the two maps differ in size an error is printed and the JVM exits. A
	 * string that is missing from the ParaMor map is reported on stderr and
	 * keeps just its Morfessor analyses.
	 * 
	 * @throws IOException declared for callers; not thrown by the current implementation
	 */
	public void combineAnalyses() throws IOException {
		
		int morfessorAnalysesCount = morfessorSegmentationsByAnalyzedString.size();
		int paraMorAnalysesCount   = paraMorSegmentationsByAnalyzedString.size();
		
		if (morfessorAnalysesCount != paraMorAnalysesCount) {
			System.err.println("ERROR: To Combine two sets of Morpho Challenge 2007 style");
			System.err.println("  analyses, each set much contain exactly the same analyses.");
			System.err.println(
					"  But there are:   " + morfessorAnalysesCount +
					" analyses in one of the files");
			System.err.println(
					"  While there are: " + paraMorAnalysesCount +
					" in the other");
			System.err.println();
			System.err.println("EXITING...");
			System.exit(0);
		}
		
		int countOfAnalyzedStringsInA = 0;
		for (Map.Entry<String, String> entryFromA : 
				morfessorSegmentationsByAnalyzedString.entrySet()) {
			
			countOfAnalyzedStringsInA++;
			if ((countOfAnalyzedStringsInA % PROGRESS_INTERVAL) == 0) {
				System.err.println(countOfAnalyzedStringsInA + " analyses have been combined");
			}
			
			String analyzedStringFromA = entryFromA.getKey();
			String analysesFromA = entryFromA.getValue();
			
			String analysesFromB = paraMorSegmentationsByAnalyzedString.get(analyzedStringFromA);
			
			if (analysesFromB == null) {
				System.err.println();
				System.err.println("ERROR: The string: |" + analyzedStringFromA + "|");
				System.err.println("  was analyzed in one file but not in the other");
				System.err.println();
				//System.err.println("EXITING...");
				//System.exit(0);
			}
			
			// Sometimes the ParaMor analysis can be nothing whatever, for
			// example if we are only including ParaMor suffixes and there were
			// no ParaMor suffixes for this analyzed string. (But there is always
			// a Morfessor /STM). A string missing from the ParaMor file is
			// treated the same way, rather than dereferencing null.
			String combinedAnalyses = analysesFromA;
			if ((analysesFromB != null) && (analysesFromB.length() > 0)) {
				combinedAnalyses += ", " + analysesFromB;
			}
			
			combinedAnalysesByAnalyzedString.put(
					analyzedStringFromA, 
					combinedAnalyses);
		}
	}

	/**
	 * Writes one line per analyzed string, in sorted order, as the analyzed
	 * string, a tab, and its combined analyses; then closes the output writer
	 * so that buffered output actually reaches the file.
	 */
	private void writeOutCombinedAnalyses() {
		try {
			for (Map.Entry<String, String> combinedEntry : 
					combinedAnalysesByAnalyzedString.entrySet()) {
				
				combinedAnalysesOutputWriter.println(
						combinedEntry.getKey() + "\t" + combinedEntry.getValue());
			}
			
			// PrintWriter swallows IOExceptions; checkError() flushes the stream
			// and reports whether any write failed.
			if (combinedAnalysesOutputWriter.checkError()) {
				System.err.println();
				System.err.println("ERROR: A problem occurred while writing the combined analyses");
				System.err.println();
			}
		} finally {
			combinedAnalysesOutputWriter.close();
		}
	}
}
