package cmonson.morphologyChallengeUtilities;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cmonson.util.FileUtils;

/**
 * Converts a file of Morfessor-style analyses into Morpho Challenge style
 * analyses.
 *
 * Each non-blank, non-comment input line holds a frequency count followed by
 * an analysis, for example:
 *
 * <pre>
 * 1579 unido/STM + s/SUF
 * </pre>
 *
 * For each such line one output line is written, holding the reconstructed
 * surface form, a tab, and the converted analysis:
 *
 * <pre>
 * unidos	unido/STM +s/SUF
 * </pre>
 *
 * Both files are opened through {@code FileUtils} with the "ISO-8859-1"
 * (latin 1) encoding name.
 */
public class MorfessorToMorphoChallengeAnalysis {

	// All regular expressions are compiled once here rather than once per
	// input line.

	/** Matches a line that is empty or holds only whitespace. */
	private static final Pattern BLANK_LINE_PATTERN = Pattern.compile("^\\s*$");

	/** Matches a morfessor comment line (first non-space character is '#'). */
	private static final Pattern COMMENT_LINE_PATTERN = Pattern.compile("^\\s*#.*$");

	/** Matches 'frequency-count analysis'; group 1 is the analysis. */
	private static final Pattern CITATION_FORM_PATTERN = Pattern.compile("^\\s*\\d+ (.*)$");

	/** Matches the ' + ' separator between annotated morphemes. */
	private static final Pattern MORPHEME_SEPARATOR_PATTERN = Pattern.compile("\\s+\\+\\s+");

	BufferedReader morfessorStyleAnalysesReader = null;
	PrintWriter morphoChallengeStyleAnalysesWriter = null;

	/**
	 * Opens the input file for reading and the output file for writing.
	 *
	 * @param morfessorStyleAnalyses the morfessor-style analyses to read
	 * @param morphoChallengeStyleAnalyses the file the converted analyses are
	 *        written to
	 */
	public 
	MorfessorToMorphoChallengeAnalysis(
			File morfessorStyleAnalyses, 
			File morphoChallengeStyleAnalyses) {
		
		morfessorStyleAnalysesReader = 
			FileUtils.openFileForReading(morfessorStyleAnalyses, "ISO-8859-1"); //latin 1
		
		morphoChallengeStyleAnalysesWriter = 
			FileUtils.openFileForWriting(morphoChallengeStyleAnalyses, "ISO-8859-1"); // latin 1
	}
	
	/**
	 * Command line entry point.
	 *
	 * @param args the command line arguments which must look like:
	 *
	 * {@code <morfessor-analysis-file> <morphoChallenge-style-output-file>}
	 * @throws IOException if reading the input or writing the output fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 2) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java MorfessorToMorphoChallengeAnalysis " + String.format("%n") +
			 "        <path-to-file-of-morfessor-style-analyses> " + String.format("%n") +
			 "        <path-to-morphoChallenge-style-output-file>" + String.format("%n%n"));
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			System.exit(0);
		}
		
		MorfessorToMorphoChallengeAnalysis converter = 
			new MorfessorToMorphoChallengeAnalysis(
					new File(args[0]), 
					new File(args[1]));
		
		converter.convert();
	}

	/**
	 * Reads every line of the morfessor-style input, converts it, and writes
	 * the result to the output file. Blank lines and '#' comment lines are
	 * skipped. A line that does not look like 'count analysis' is reported on
	 * standard error and skipped.
	 *
	 * Both files are closed when this method returns, whether it succeeds or
	 * fails, so it can only be called once per instance.
	 *
	 * @throws IOException if reading the input fails, if the writer reports
	 *         an output error, or if closing the input fails
	 */
	public void convert() throws IOException {
		try {
			int lineCounter = 0;
			String lineFromMorfessorStyleAnalyses;
			while ((lineFromMorfessorStyleAnalyses = 
						morfessorStyleAnalysesReader.readLine()) != null) {
				
				// skip blank lines and morfessor comments
				if (BLANK_LINE_PATTERN.matcher(lineFromMorfessorStyleAnalyses).matches() ||
					COMMENT_LINE_PATTERN.matcher(lineFromMorfessorStyleAnalyses).matches()) {
					continue;
				}
				
				// progress report every 1000 content lines
				lineCounter++;
				if ((lineCounter%1000) == 0) {
					System.err.println("  " + lineCounter + " " + lineFromMorfessorStyleAnalyses);
					System.err.flush();
				}
				
				// Each line holds a frequency count of the type and an analysis of the type:
				//
				// 1581 esta/STM
				// 1579 unido/STM + s/SUF
				// 1543 esta/STM + do/SUF

				// Throw away the frequency count
				Matcher citationFormMatcher = 
					CITATION_FORM_PATTERN.matcher(lineFromMorfessorStyleAnalyses);
				if ( ! citationFormMatcher.matches()) {
					System.err.println();
					System.err.println("BADLY FORMATTED Morfessor Analysis File!!");
					System.err.println();
					System.err.println("Line: " + lineFromMorfessorStyleAnalyses);
					System.err.println("  (line skipped)");
					System.err.println();
					// group(1) would throw IllegalStateException after a failed
					// match, so there is nothing to convert on this line.
					continue;
				}
				String morfessorAnalysis = citationFormMatcher.group(1);
				
				// reconstruct the original surface form of the type by concatenating
				// the analyzed 'morphemes' from the morfessor output.
				// i.e. unido/STM + s/SUF --> unidos
				String type = getTypeFromMorfessorAnalysis(morfessorAnalysis);
				
				// Convert the morfessor analysis into a morphoChallenge style analysis
				//
				// i.e. de/PRE + tener/STM + se/SUF --> +de/PRE tener/STM +se/SUF
				String morphoChallengeStyleAnalysis = 
					getMorphoChallengeStyleAnalysisFromMorfessorAnalysis(morfessorAnalysis);
				
				morphoChallengeStyleAnalysesWriter.println(
						type + "\t" + morphoChallengeStyleAnalysis);
			}
			
			// PrintWriter never throws on write failures; checkError() flushes
			// and reports whether any write went wrong.
			if (morphoChallengeStyleAnalysesWriter.checkError()) {
				throw new IOException(
						"Failed while writing the morphoChallenge style analyses");
			}
		} finally {
			// Always release both files. Closing the writer also flushes any
			// output that is still buffered.
			if (morphoChallengeStyleAnalysesWriter != null) {
				morphoChallengeStyleAnalysesWriter.close();
			}
			if (morfessorStyleAnalysesReader != null) {
				morfessorStyleAnalysesReader.close();
			}
		}
	}

	/**
	 * Reconstructs the original surface form of a type by concatenating the
	 * analyzed 'morphemes' from the morfessor output.
	 *
	 * i.e. unido/STM + s/SUF --> unidos
	 *
	 * @param morfessorAnalysis an analysis that looks like: 'unido/STM + s/SUF'
	 * @return the morphemes joined together with their annotations removed
	 */
	public static String getTypeFromMorfessorAnalysis(String morfessorAnalysis) {
		StringBuilder type = new StringBuilder();
		for (String annotatedMorpheme : MORPHEME_SEPARATOR_PATTERN.split(morfessorAnalysis)) {
			// Strip off the annotation of '/STM', '/SUF', etc.: everything
			// from the first '/' onward is dropped.
			int slashIndex = annotatedMorpheme.indexOf('/');
			if (slashIndex >= 0) {
				type.append(annotatedMorpheme, 0, slashIndex);
			} else {
				type.append(annotatedMorpheme);
			}
		}
		return type.toString();
	}

	/**
	 * Converts a morfessor analysis into a morphoChallenge style analysis.
	 *
	 * i.e. de/PRE + tener/STM + se/SUF --> +de/PRE tener/STM +se/SUF
	 *
	 * We retain the prefix and suffix information to differentiate prefixes
	 * and suffixes that have identical surface form strings.
	 *
	 * @param morfessorAnalysis an analysis that looks like: 'unido/STM + s/SUF'
	 * @return the annotated morphemes separated by single spaces, with a '+'
	 *         in front of every morpheme annotated '/PRE' or '/SUF'
	 */
	private String getMorphoChallengeStyleAnalysisFromMorfessorAnalysis(String morfessorAnalysis) {
		StringBuilder morphoChallengeStyleAnalysis = new StringBuilder();
		
		for (String annotatedMorpheme : MORPHEME_SEPARATOR_PATTERN.split(morfessorAnalysis)) {
			
			// single space between morphemes, none before the first
			if (morphoChallengeStyleAnalysis.length() > 0) {
				morphoChallengeStyleAnalysis.append(' ');
			}
			
			// affixes are marked with a leading '+'
			if (annotatedMorpheme.endsWith("/PRE") || annotatedMorpheme.endsWith("/SUF")) {
				morphoChallengeStyleAnalysis.append('+');
			}
			
			morphoChallengeStyleAnalysis.append(annotatedMorpheme);
		}
		
		return morphoChallengeStyleAnalysis.toString();
	}    

}
