package cmonson.morphologyChallengeUtilities;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cmonson.util.FileUtils;

/**
 * Rewrites a file of morphological analyses so that only a word-final suffix
 * stays segmented; every other morpheme is concatenated back together.
 *
 * Two input layouts are understood, selected by {@link #analysesInMorfessorStyle}:
 *
 * <ul>
 * <li>Morfessor style, one {@code <count> <analysis>} pair per line, e.g.
 *     {@code 1579 unido/STM + s/SUF}</li>
 * <li>Morpho Challenge style, one {@code <word> <whitespace> <analysis>} pair
 *     per line, where the analysis is a whitespace separated list of
 *     {@code morpheme/TYPE} tokens, each optionally prefixed with {@code +}</li>
 * </ul>
 *
 * Blank lines and lines whose first non-blank character is {@code #} are
 * ignored. Lines that cannot be parsed are reported on standard error and
 * skipped.
 */
public class MorfessorProcessor_LeaveOnlyFinalSufix {

	/** Flip to true to trace every morpheme decision on standard error. */
	private static final boolean DEBUG = false;

	/** The morpheme type that marks a suffix. */
	private static final String SUFFIX_TAG = "SUF";

	// The regular expressions never change, so they are compiled once here
	// instead of once per input line.
	private static final Pattern BLANK_LINE = Pattern.compile("^\\s*$");
	private static final Pattern COMMENT_LINE = Pattern.compile("^\\s*#.*$");

	/** {@code <frequency count> <analysis>} */
	private static final Pattern MORFESSOR_LINE = Pattern.compile("^\\s*(\\d+) (.*)$");
	/** {@code <word> <whitespace> <analysis>} */
	private static final Pattern MORPHO_CHALLENGE_LINE = Pattern.compile("^(\\S+)\\s+(.*)$");

	/** Morfessor separates annotated morphemes with a free standing '+'. */
	private static final Pattern MORFESSOR_SEPARATOR = Pattern.compile("\\s+\\+\\s+");
	/** Morpho Challenge separates annotated morphemes with whitespace only. */
	private static final Pattern MORPHO_CHALLENGE_SEPARATOR = Pattern.compile("\\s+");

	/** {@code morpheme/TYPE} */
	private static final Pattern MORFESSOR_MORPHEME = Pattern.compile("^([^/]+)/(.*)$");
	/** {@code morpheme/TYPE}, optionally prefixed with '+' (the '+' is not part of the morpheme). */
	private static final Pattern MORPHO_CHALLENGE_MORPHEME = Pattern.compile("^\\+?([^/]+)/(.*)$");

	/** Source of the analyses to process; closed by {@link #convert()}. */
	BufferedReader morfessorAnalysesReader = null;
	/** Destination of the processed analyses; closed by {@link #convert()}. */
	PrintWriter processedMorfessorWriter = null;

	/**
	 * True when the input is in Morfessor style, false when it is in Morpho
	 * Challenge style. NOTE: the only constructor of this class sets this flag
	 * to false, so the Morfessor style branch only runs if other code assigns
	 * the field before calling {@link #convert()}.
	 */
	boolean analysesInMorfessorStyle = true;

	/**
	 * Opens the input and the output file through {@code FileUtils}, passing
	 * the ISO-8859-1 (Latin-1) encoding name for both, and selects the Morpho
	 * Challenge input layout.
	 *
	 * @param morfessorStyleAnalyses the file of analyses to read
	 * @param processedMorfessorFile the file the processed analyses are written to
	 */
	public
	MorfessorProcessor_LeaveOnlyFinalSufix(
			File morfessorStyleAnalyses,
			File processedMorfessorFile) {

		analysesInMorfessorStyle = false;

		morfessorAnalysesReader =
			FileUtils.openFileForReading(morfessorStyleAnalyses, "ISO-8859-1"); //latin 1

		processedMorfessorWriter =
			FileUtils.openFileForWriting(processedMorfessorFile, "ISO-8859-1"); // latin 1
	}


	/**
	 * Command line entry point. Prints a usage message and exits with a
	 * non-zero status unless exactly two arguments are given.
	 *
	 * @param args the command line arguments which must look like:
	 *
	 * {@code <morfessor-analysis-file> <morphoChallenge-style-output-file>}
	 * @throws IOException if reading the analyses or writing the output fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 2) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java MorfessorProcessor " + String.format("%n") +
			 "        <path-to-file-of-morpho-Challenge-style-Morfessor-analyses> " + String.format("%n") +
			 "        <path-to-processed-file-output>" + String.format("%n%n"));
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			// A usage error is a failure; do not report success to the caller.
			System.exit(1);
		}

		MorfessorProcessor_LeaveOnlyFinalSufix converter =
			new MorfessorProcessor_LeaveOnlyFinalSufix(
					new File(args[0]),
					new File(args[1]));

		converter.convert();
	}

	/**
	 * Reads every line of the input, writes the processed analysis of each
	 * well formed line to the output, and finally closes both streams (also
	 * when an exception ends the conversion early, so that whatever was
	 * produced so far reaches the output file).
	 *
	 * A progress line is printed on standard error every 1000 processed lines.
	 * Malformed lines are reported on standard error and skipped.
	 *
	 * @throws IOException if reading fails, or if the writer recorded an error
	 */
	public void convert() throws IOException {

		try {
			int lineCounter = 0;
			String line;
			while ((line = morfessorAnalysesReader.readLine()) != null) {

				// skip blank lines and morfessor comments
				if (BLANK_LINE.matcher(line).matches()
						|| COMMENT_LINE.matcher(line).matches()) {
					continue;
				}

				lineCounter++;
				if ((lineCounter%1000) == 0) {
					System.err.println("  " + lineCounter + " " + line);
					System.err.flush();
				}

				String processedAnalysis;
				try {
					processedAnalysis = convertLine(line);
				} catch (IllegalArgumentException malformed) {
					// Report the offending line and carry on with the next one.
					reportBadlyFormattedLine(line, malformed.getMessage());
					continue;
				}

				processedMorfessorWriter.println(processedAnalysis);
			}

			// PrintWriter never throws on a failed write; it only records the
			// failure, so it has to be asked explicitly.
			if (processedMorfessorWriter.checkError()) {
				throw new IOException("An error occurred while writing the processed analyses");
			}
		} finally {
			closeStreams();
		}
	}

	/**
	 * Converts one non-blank, non-comment input line into its output line.
	 *
	 * @param line the raw input line
	 * @return the text to write for this line
	 * @throws IllegalArgumentException if the line, or one of its annotated
	 *         morphemes, does not have the expected form
	 */
	private String convertLine(String line) {

		// Morfessor style lines hold a frequency count and an analysis:
		//
		// 1579 unido/STM + s/SUF
		// 1537 ha/STM
		//
		// Morpho Challenge style lines hold the word and its analysis:
		//
		// <word> \t <space separated analysis>
		Pattern linePattern =
			analysesInMorfessorStyle ? MORFESSOR_LINE : MORPHO_CHALLENGE_LINE;

		Matcher lineMatcher = linePattern.matcher(line);
		if ( ! lineMatcher.matches()) {
			throw new IllegalArgumentException(
					"the line does not match the pattern " + linePattern.pattern());
		}

		String analysis = lineMatcher.group(2);

		if (analysesInMorfessorStyle) {
			// the frequency count (group 1) is thrown away
			return getMorphoChallengeStyleAnalysisKeepingOnlyTheWordFinalSuffixes(analysis);
		}

		String wordform = lineMatcher.group(1);
		return wordform + "\t" + keepOnlyWordFinalSuffixes(analysis);
	}

	/** Prints the diagnostic for a line that could not be parsed. */
	private static void reportBadlyFormattedLine(String line, String problem) {
		System.err.println();
		System.err.println("BADLY FORMATTED Morfessor Analysis File!!");
		System.err.println();
		System.err.println("Line: " + line);
		System.err.println("Problem: " + problem);
		System.err.println();
	}

	/** Closes the writer (flushing it) and the reader, whichever of them exist. */
	private void closeStreams() {
		if (processedMorfessorWriter != null) {
			processedMorfessorWriter.close();
		}
		if (morfessorAnalysesReader != null) {
			try {
				morfessorAnalysesReader.close();
			} catch (IOException ignored) {
				// All input has been consumed (or a more relevant exception is
				// already propagating); a failure to close must not mask that.
			}
		}
	}

	/**
	 * Builds the output line for a Morfessor style analysis such as
	 * {@code unido/STM + s/SUF}: the type as computed by
	 * {@code MorfessorToMorphoChallengeAnalysis.getTypeFromMorfessorAnalysis},
	 * a tab, and then the morphemes concatenated, except that a word-final
	 * morpheme of type SUF is kept annotated and set off by {@code " +"}.
	 *
	 * @param morfessorAnalysis the analysis part of a Morfessor style line
	 * @return the output line for the analysis
	 * @throws IllegalArgumentException if an annotated morpheme is not of the
	 *         form {@code morpheme/TYPE}
	 */
	private String
	getMorphoChallengeStyleAnalysisKeepingOnlyTheWordFinalSuffixes(String morfessorAnalysis) {

		if (DEBUG) {
			System.err.println("Converting the Morfessor Analysis: " + morfessorAnalysis);
		}

		String type =
			MorfessorToMorphoChallengeAnalysis.getTypeFromMorfessorAnalysis(
					morfessorAnalysis);

		return type + "\t" +
			rejoinAllButFinalSuffix(
					morfessorAnalysis,
					MORFESSOR_SEPARATOR,
					MORFESSOR_MORPHEME,
					" +");
	}

	/**
	 * Concatenates the morphemes of a Morpho Challenge style analysis, except
	 * that a word-final morpheme of type SUF is kept as it appeared in the
	 * input (annotation and any leading '+' included), set off by a space.
	 *
	 * @param morfessorAnalysis whitespace separated {@code morpheme/TYPE} tokens
	 * @return the rejoined analysis
	 * @throws IllegalArgumentException if a token is not of the form
	 *         {@code morpheme/TYPE}
	 */
	private String keepOnlyWordFinalSuffixes(String morfessorAnalysis) {

		if (DEBUG) {
			System.err.println("Converting the Analysis: " + morfessorAnalysis);
		}

		return rejoinAllButFinalSuffix(
				morfessorAnalysis,
				MORPHO_CHALLENGE_SEPARATOR,
				MORPHO_CHALLENGE_MORPHEME,
				" ");
	}

	/**
	 * Shared worker of the two rejoining methods: splits the analysis into
	 * annotated morphemes and appends the bare morpheme of each one, except
	 * that a final morpheme of type SUF is appended annotated, preceded by
	 * the given prefix.
	 *
	 * @param analysis the analysis to process
	 * @param separator what separates two annotated morphemes
	 * @param morphemePattern matches one annotated morpheme; group 1 is the
	 *        morpheme, group 2 its type
	 * @param finalSuffixPrefix the text placed in front of a kept final suffix
	 * @return the rejoined analysis
	 * @throws IllegalArgumentException if an annotated morpheme does not match
	 *         {@code morphemePattern}
	 */
	private static String rejoinAllButFinalSuffix(
			String analysis,
			Pattern separator,
			Pattern morphemePattern,
			String finalSuffixPrefix) {

		String[] annotatedMorphemes = separator.split(analysis);
		int lastIndex = annotatedMorphemes.length - 1;

		StringBuilder rejoined = new StringBuilder();

		for (int index = 0; index <= lastIndex; index++) {

			String annotatedMorpheme = annotatedMorphemes[index];

			if (DEBUG) {
				System.err.println("  Converting the annotated morpheme: " + annotatedMorpheme);
			}

			Matcher morphemeMatcher = morphemePattern.matcher(annotatedMorpheme);
			if ( ! morphemeMatcher.matches()) {
				// Matcher.group() throws an uninformative IllegalStateException
				// after a failed match, so fail here with a useful message.
				throw new IllegalArgumentException(
						"the annotated morpheme '" + annotatedMorpheme
						+ "' is not of the form <morpheme>/<type>");
			}
			String morpheme = morphemeMatcher.group(1);
			String morphemeType = morphemeMatcher.group(2);

			boolean isWordFinal = (index == lastIndex);

			if (isWordFinal && SUFFIX_TAG.equals(morphemeType)) {
				if (DEBUG) {
					System.err.println("    which IS a word final suffix");
					System.err.println(
							"    concatenating: |" + finalSuffixPrefix + annotatedMorpheme
							+ "| onto the analysis");
				}
				rejoined.append(finalSuffixPrefix).append(annotatedMorpheme);

			} else {
				if (DEBUG) {
					System.err.println("    which is NOT a word final suffix");
					System.err.println("    concatenating: |" + morpheme + "| onto the analysis");
				}
				rejoined.append(morpheme);
			}
		}

		return rejoined.toString();
	}

}
