
package quechua_morphology_analysis;

import java.io.*;
import java.util.*;
import java.util.regex.*;

/**
 * Segments a word into a stem followed by zero or more suffixes, using a
 * stem lexicon and a suffix lexicon loaded from comma-delimited data files.
 *
 * Typical use: construct a parser, call {@link #readStemDataFile(String)}
 * and {@link #readSuffixDataFile(String)}, then either call
 * {@link #parseWord(String)} directly or run the interactive loop
 * {@link #parseWordFromStdinToStdout()}.
 *
 * @author cmonson
 */
public class QuechuaSuffixParser {

    /** Set above 0 to trace data-file loading on stdout. */
    private static final int LOAD_DEBUG = 0;

    /** Set above 0 to echo each word being analyzed on stderr. */
    private static final int PARSE_DEBUG = 1;

    /*
     * Line formats of the two data files.  The third group is reluctant so
     * that the trailing "\\s*" really strips trailing whitespace from the
     * f-structure (a greedy group would swallow it).
     */
    private static final Pattern STEM_DATA_PATTERN =
        Pattern.compile("^\\s*([^,\\s]+)\\s*,\\s*([^,\\s]+)\\s*,\\s*(.*?)\\s*$");
    private static final Pattern SUFFIX_DATA_PATTERN =
        Pattern.compile("^\\s*([^,\\s]+)\\s*,\\s*([^,\\s]*)\\s*,\\s*(.+?)\\s*$");
    private static final Pattern COMMENT_PATTERN    = Pattern.compile("^\\s*#");
    private static final Pattern BLANK_LINE_PATTERN = Pattern.compile("^\\s*$");

    /* Member Variables */

    /** Suffixes indexed by graph; several suffixes may share one graph. */
    private final Hashtable<String, Vector<Suffix>> hashOfSuffixes;

    /** Stems indexed by graph; several stems may share one graph. */
    private final Hashtable<String, Vector<Stem>> hashOfStems;


    /** Creates a new instance of QuechuaSuffixParser with empty lexicons. */
    public QuechuaSuffixParser() {
        hashOfSuffixes = new Hashtable<String, Vector<Suffix>>();
        hashOfStems    = new Hashtable<String, Vector<Stem>>();
    }


    /**
     * Parse the data file containing the stems and add them to the stem
     * lexicon.
     *
     * The file must be in the following format:
     *   Each row represents a single stem.
     *   There are 3 comma (',') delimited fields in each row.
     *   The first column is the stem graph, i.e. the character string that
     *     is the surface realization of the stem.
     *   The second column is the part of speech category of the stem.
     *   The third column contains the f-structure associated with this
     *     stem (may be empty).
     * Blank lines and lines whose first non-blank character is '#' are
     * skipped.  An ill-formed line is reported on stderr and terminates
     * the JVM with exit status 1.  An I/O error (e.g. missing file) is
     * reported via a stack trace; stems read up to that point are kept.
     *
     * @param stemDataFileName path of the stem data file
     */
    public void readStemDataFile(String stemDataFileName) {

        if (LOAD_DEBUG > 0) {
            System.out.println("reading the stem data file and building the hashOfStems data structure");
        }

        List<String[]> rows = readRecords(stemDataFileName,
                                          STEM_DATA_PATTERN,
                                          "stem",
                                          "<stem-graph>, <stem-POS>, <f-structure>");

        for (String[] row : rows) {
            Stem stem = new Stem(row[0], row[1], row[2]);
            addToIndex(hashOfStems, stem.graph, stem);
        }
    }

    /**
     * Parse the data file containing the suffixes and add them to the
     * suffix lexicon.
     *
     * The file must be in the following format:
     *   Each row represents a single suffix.
     *   There are 3 comma (',') delimited fields in each row.
     *   The first column is the suffix graph, i.e. the character string
     *     that is the surface realization of the suffix.
     *   The second column is the part of speech category of the suffix.
     *     It is optional: the comma must be present but the column may
     *     contain only whitespace, in which case the suffix attaches to
     *     stems of any category.
     *   The third column contains the f-structure associated with this
     *     suffix (must not be empty).
     * Blank lines and lines whose first non-blank character is '#' are
     * skipped.  An ill-formed line is reported on stderr and terminates
     * the JVM with exit status 1.  An I/O error (e.g. missing file) is
     * reported via a stack trace; suffixes read up to that point are kept.
     *
     * @param suffixDataFileName path of the suffix data file
     */
    public void readSuffixDataFile(String suffixDataFileName) {

        if (LOAD_DEBUG > 0) {
            System.out.println("reading the suffix data file and building the hashOfSuffixes data structure");
        }

        List<String[]> rows = readRecords(suffixDataFileName,
                                          SUFFIX_DATA_PATTERN,
                                          "suffix",
                                          "<suffix-graph>, (<suffix-POS>), <f-structure>");

        for (String[] row : rows) {
            Suffix suffix = new Suffix();
            suffix.setVariables(row[0], row[1], row[2]);
            addToIndex(hashOfSuffixes, suffix.graph, suffix);
        }
    }

    /**
     * Reads every data row of a lexicon file as a {graph, POS, f-structure}
     * triple, skipping comment and blank lines.
     *
     * @param dataFileName path of the file to read
     * @param dataPattern  pattern with three groups describing a data row
     * @param kind         "stem" or "suffix"; used only in messages
     * @param formatHint   human-readable row format shown on a bad line
     * @return the rows read, in file order (partial if an I/O error occurs)
     */
    private List<String[]> readRecords(String dataFileName,
                                       Pattern dataPattern,
                                       String kind,
                                       String formatHint) {

        List<String[]> rows = new ArrayList<String[]>();
        BufferedReader reader = null;

        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(dataFileName)));

            String line;
            while ((line = reader.readLine()) != null) {

                if (LOAD_DEBUG > 0) {
                    System.out.print("\n\nline: ");
                    System.out.println(line);
                    System.out.println();
                }

                // lookingAt() returns true if even a prefix of the string matches
                if (COMMENT_PATTERN.matcher(line).lookingAt()) {
                    if (LOAD_DEBUG > 0) {
                        System.out.println("\tFound a comment line, skipping...\n");
                    }
                    continue;
                }

                if (BLANK_LINE_PATTERN.matcher(line).matches()) {
                    if (LOAD_DEBUG > 0) {
                        System.out.println("\tFound a Blank line, skipping...\n");
                    }
                    continue;
                }

                Matcher dataMatcher = dataPattern.matcher(line);
                if ( ! dataMatcher.matches()) {
                    System.err.print("\nIll-formated line in the " + kind + " data file: ");
                    System.err.println(dataFileName);
                    System.err.println("\n\tEach line in the " + kind + " data file must look like:\n");
                    System.err.println("\t" + formatHint + "\n");
                    System.err.println("The offending line is:\n");
                    System.err.print("|");
                    System.err.print(line);
                    System.err.println("|");
                    System.err.println("\nExiting...\n");
                    System.err.flush();
                    // A malformed lexicon is a fatal error: report failure
                    // to the calling process with a non-zero status.
                    System.exit(1);
                }

                String[] row = { dataMatcher.group(1),
                                 dataMatcher.group(2),
                                 dataMatcher.group(3) };

                if (LOAD_DEBUG > 0) {
                    System.out.print(kind + "Graph: ");
                    System.out.println(row[0]);
                    System.out.print(kind + "POS: ");
                    System.out.println(row[1]);
                    System.out.print("fStructure: ");
                    System.out.println(row[2]);
                }

                rows.add(row);
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        finally {
            // Close the file even when reading failed part-way through.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if close() itself fails
                }
            }
        }

        return rows;
    }

    /** Appends an entry to the vector stored under graph, creating the vector on first use. */
    private static <T> void addToIndex(Hashtable<String, Vector<T>> index,
                                       String graph,
                                       T entry) {
        Vector<T> bucket = index.get(graph);
        if (bucket == null) {
            bucket = new Vector<T>();
            index.put(graph, bucket);
        }
        bucket.addElement(entry);
    }

    /**
     * Interactive loop: reads one word per line from stdin and writes all
     * parses of that word on a single line of stdout.
     *
     * Within a parse the f-structures of the morphemes are separated by
     * ", "; every parse is followed by "; ".  A word with no parse yields
     * an empty line.  The loop ends when a line containing the string
     * "**EXIT**" is read or when stdin reaches end of input.  Prompts and
     * diagnostics go to stderr so that stdout carries only parse results.
     */
    public void parseWordFromStdinToStdout() {

        System.err.println();
        System.err.println("Waiting for Quechua Words to analyze.");
        System.err.println("  Enter 1 word per line.");
        System.err.println("  To quit pass in the string \"**EXIT**\"");
        System.err.println();
        System.err.println();

        try {
            BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));

            String word;
            // readLine() returns null at end of input; treat that like **EXIT**
            while ((word = stdin.readLine()) != null && ! word.contains("**EXIT**")) {

                if (PARSE_DEBUG > 0) {
                    System.err.print("The java morphology analysis code is now analyzing the word: |");
                    System.err.print(word);
                    System.err.println("|");
                }

                System.out.println(formatParses(parseWord(word)));
                System.out.flush();
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Renders the parses of one word in the single-line stdout format. */
    @SuppressWarnings("rawtypes")
    private static String formatParses(Vector<Vector> parsesOfAWord) {

        StringBuilder out = new StringBuilder();

        for (Vector parse : parsesOfAWord) {
            for (int morphemeIndex = 0; morphemeIndex < parse.size(); morphemeIndex++) {
                if (morphemeIndex > 0) {
                    out.append(", ");
                }
                // the first morpheme of a parse is the Stem, the rest are Suffixes
                if (morphemeIndex == 0) {
                    out.append(((Stem) parse.get(morphemeIndex)).fstructure);
                } else {
                    out.append(((Suffix) parse.get(morphemeIndex)).fstructure);
                }
            }
            out.append("; ");
        }

        return out.toString();
    }

    /**
     * Finds every segmentation of a word into a known stem followed by a
     * (possibly empty) sequence of known suffixes that are compatible with
     * the stem's category.
     *
     * @param word the word to analyze; null or empty yields no parses
     * @return one Vector per parse; element 0 of a parse is the
     *         {@link Stem}, the remaining elements are {@link Suffix}es in
     *         surface order.  Empty if the word cannot be parsed.
     */
    // Raw Vectors are part of the public signature: a parse mixes Stem and
    // Suffix objects.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public Vector<Vector> parseWord(String word) {

        Vector<Vector> completeParses = new Vector<Vector>();

        if (word == null) {
            return completeParses;
        }

        // try every non-empty prefix of the word as the stem
        for (int indexIntoWord = 1; indexIntoWord <= word.length(); indexIntoWord++) {

            Vector<Stem> vectorOfStems = hashOfStems.get(word.substring(0, indexIntoWord));
            if (vectorOfStems == null) {
                continue;
            }

            // everything after the stem must be covered by suffixes
            String suffixGroup = word.substring(indexIntoWord);

            for (Stem stem : vectorOfStems) {
                Vector parse = new Vector();
                parse.add(stem);

                completeParses = parseSuffixGroup(suffixGroup,
                                                  0,
                                                  stem.category,
                                                  parse,
                                                  completeParses);
            }
        }

        return completeParses;
    }

    /**
     * Recursively extends a partial parse with suffixes until the whole
     * suffix group is consumed.
     *
     * A suffix may be appended when its category equals the given category
     * or when its category is the empty string.
     *
     * @param suffixGroup               the part of the word after the stem
     * @param beginIndexIntoSuffixGroup index of the first character not yet
     *                                  covered by the partial parse
     * @param category                  category of the stem of this parse
     * @param parse                     the partial parse; temporarily
     *                                  modified during the search and
     *                                  restored before returning
     * @param completeParses            accumulator; a copy of every complete
     *                                  parse found is appended to it
     * @return the same completeParses object that was passed in
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public Vector<Vector> parseSuffixGroup(String suffixGroup,
                                           int beginIndexIntoSuffixGroup,
                                           String category,
                                           Vector parse,
                                           Vector<Vector> completeParses) {

        /* Parse found! Store a snapshot, because parse keeps being mutated. */
        if (beginIndexIntoSuffixGroup == suffixGroup.length()) {
            completeParses.add(new Vector(parse));
            return completeParses;
        }

        for (int endIndexIntoSuffixGroup = beginIndexIntoSuffixGroup + 1;
             endIndexIntoSuffixGroup <= suffixGroup.length();
             endIndexIntoSuffixGroup++) {

            String potentialSuffixGraph = suffixGroup.substring(beginIndexIntoSuffixGroup,
                                                                endIndexIntoSuffixGroup);

            Vector<Suffix> vectorOfSuffixes = hashOfSuffixes.get(potentialSuffixGraph);
            if (vectorOfSuffixes == null) {
                continue;
            }

            for (Suffix suffix : vectorOfSuffixes) {

                if (suffix.category.equals(category)
                    || suffix.category.length() == 0) {

                    parse.addElement(suffix);

                    // Recursion to find the next matching suffix
                    completeParses = parseSuffixGroup(suffixGroup,
                                                      endIndexIntoSuffixGroup,
                                                      category,
                                                      parse,
                                                      completeParses);

                    // Backtrack: drop the suffix just tried so that the
                    // next alternative starts from the same partial parse
                    parse.removeElementAt(parse.size() - 1);
                }
            }
        }

        return completeParses;
    }


    /**
     * Loads both data files and then analyzes words read from stdin.
     *
     * @param args the command line arguments which must look like:
     *
     * {@code <stem-data-file> <suffix-data-file>}
     */
    public static void main(String[] args) {

        System.err.println();
        System.err.println("Starting up the java implemented Quechua Morphological Analyzer");
        System.err.println();

        if (args.length != 2) {
            // Diagnostics go to stderr (stdout is reserved for parse
            // results) and a usage error is reported with a non-zero status.
            System.err.print("The command line must contain exactly 2 arguments:");
            System.err.println("\t<stem-data-file> <suffix-data-file>");
            System.err.println("\n\tExiting...");
            System.exit(1);
        }

        QuechuaSuffixParser parser = new QuechuaSuffixParser();

        String stemDataFileName   = args[0];
        String suffixDataFileName = args[1];

        System.err.print("In java analyzer: Reading the stem data file: ");
        System.err.println(stemDataFileName);
        System.err.println();
        parser.readStemDataFile(stemDataFileName);

        System.err.print("In Java analyzer: Reading the suffix data file: ");
        System.err.println(suffixDataFileName);
        System.err.println();
        parser.readSuffixDataFile(suffixDataFileName);

        parser.parseWordFromStdinToStdout();

        System.err.println();
        System.err.println("Exiting peacefully the java implemented Quechua Morphological Analyzer");
        System.err.println();
    }


    /* Suffix Class Object */
    /** One entry of the suffix lexicon. */
    public class Suffix {

        /* Member Variables */
        public String graph      = "";   // Surface form of the suffix
        public String category   = "";   // Category it attaches to; "" means any category
        public String fstructure = "";   // f-structure printed for this suffix

        /**
         * Sets all three fields at once.
         *
         * @param g surface form of the suffix
         * @param c category the suffix attaches to, or "" for any
         * @param v f-structure of the suffix
         */
        public void setVariables(String g, String c, String v) {
            this.graph      = g;
            this.category   = c;
            this.fstructure = v;
        }
    }

    /* Stem Class Object */
    /** One entry of the stem lexicon. */
    public class Stem {

        /* Member Variables */
        public String graph      = "";   // Surface form of the stem
        public String category   = "";   // Part of speech category of the stem
        public String fstructure = "";   // f-structure associated with this stem

        /**
         * @param graph      surface form of the stem
         * @param category   part of speech category of the stem
         * @param fstructure f-structure associated with this stem
         */
        public Stem(String graph, String category, String fstructure) {
            this.graph      = graph;
            this.category   = category;
            this.fstructure = fstructure;
        }
    }
}

