package quechua_morphology_analysis; import java.io.*; import java.util.*; import java.util.regex.*; /** * * @author cmonson */ public class QuechuaSuffixParser { /* Member Variables */ private Hashtable> hashOfSuffixes = null; //Hash table of Suffixes private Hashtable> hashOfStems = null; //Hash table of Stems /** Creates a new instance of QuechuaSuffixParser */ public QuechuaSuffixParser() { hashOfSuffixes = new Hashtable>(); hashOfStems = new Hashtable>(); } /** Parse the data file containing the stems of Quechua. * The file must be in the following format: * Each row represents a single stem * There are 3 comma (',') delimited fields in each row * The first column is the stem graph, i.e. the character * string that is the surface realization of the stem * The second column is the part of speech category of the stem * The third column contains the f-structure associated with this * suffix that will eventually need to be returned to the * transfer engine. */ public void readStemDataFile(String stemDataFileName) { int DEBUG = 0; if (DEBUG > 0) { System.out.println("reading the stem data file and building the hashOfStems data structure"); } Pattern stemDataPattern = Pattern.compile("^\\s*([^,\\s]+)\\s*,\\s*([^,\\s]+)\\s*,\\s*(.*)\\s*$"); Pattern commentPattern = Pattern.compile("^\\s*#"); Pattern blankLinePattern = Pattern.compile("^\\s*$"); try { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(stemDataFileName))); String line; while ((line = br.readLine()) != null) { if (DEBUG > 0) { System.out.print("\n\nline: "); System.out.println(line); System.out.println(); } Matcher commentMatcher = commentPattern.matcher(line); // lookingAt() returns true if even a prefix of the string matches if (commentMatcher.lookingAt()) { if (DEBUG > 0) { System.out.println("\tFound a comment line, skipping...\n"); } continue; // go on to the next line of the stem data file } Matcher blankLineMatcher = blankLinePattern.matcher(line); if (blankLineMatcher.matches()) { if (DEBUG > 0) { System.out.println("\tFound a Blank line, skipping...\n"); } continue; // go on to the next line of the stem data file } Matcher stemDataMatcher = stemDataPattern.matcher(line); Boolean stemDataWellFormated = stemDataMatcher.matches(); if ( ! stemDataWellFormated) { System.err.print("\nIll-formated line in the stem data file: "); System.err.println(stemDataFileName); System.err.println("\n\tEach line in the stem data file must look like:\n"); System.err.println("\t, , \n"); System.err.println("The offending line is:\n"); System.err.print("|"); System.err.print(line); System.err.println("|"); System.err.println("\nExiting...\n"); System.err.flush(); System.exit(0); } String stemGraph = stemDataMatcher.group(1); String stemPOS = stemDataMatcher.group(2); String fStructure = stemDataMatcher.group(3); if (DEBUG > 0) { System.out.print("stemGraph: "); System.out.println(stemGraph); System.out.print("stemPOS: "); System.out.println(stemPOS); System.out.print("fStructure: "); System.out.println(fStructure); } /* Create Stem Object that corrosponds to the row of the * suffix data file that was just read in. */ Stem stem = new Stem(stemGraph, stemPOS, fStructure); /* Add this new Stem Object to the hash table of all stems */ Vector v = null; /* if there is more than one stem with the same graph */ if (hashOfStems.containsKey(stem.graph)) { /* Get vector and add new suffix to end of it */ v = hashOfStems.get(stem.graph); v.addElement(stem); } else { /* Add suffix to vector */ v = new Vector(); v.addElement(stem); } /* Put vector of stems into the hash table of all stems */ hashOfStems.put(stem.graph, v); } // while reading a line of the file br.close(); } // try catch (Exception e) { e.printStackTrace(); } } /** Parse the data file containing the suffixes of Quechua. * The file must be in the following format: * Each row represents a single suffix * There are 3 comma (',') delimited fields in each row * The first column is the suffix graph, i.e. the character * string that is the surface realization of the suffix * The second column is the part of speech category of the suffix * The third column contains the f-structure associated with this * suffix that will eventually need to be returned to the * transfer engine. * The 2nd column is optional. In other words there needs * to be a comma separated 2nd column but there can just be * whitespace for that column */ public void readSuffixDataFile(String suffixDataFileName) { int DEBUG = 0; if (DEBUG > 0) { System.out.println("reading the suffix data file and building the hashOfSuffixes data structure"); } Pattern suffixDataPattern = Pattern.compile("^\\s*([^,\\s]+)\\s*,\\s*([^,\\s]*)\\s*,\\s*(.+)\\s*$"); Pattern commentPattern = Pattern.compile("^\\s*#"); Pattern blankLinePattern = Pattern.compile("^\\s*$"); try { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(suffixDataFileName))); String line; while ((line = br.readLine()) != null) { if (DEBUG > 0) { System.out.print("\n\nline: "); System.out.println(line); System.out.println(); } Matcher commentMatcher = commentPattern.matcher(line); // lookingAt() returns true if even a prefix of the string matches if (commentMatcher.lookingAt()) { if (DEBUG > 0) { System.out.println("\tFound a comment line\n"); } continue; // go on to the next line of the suffix data file } Matcher blankLineMatcher = blankLinePattern.matcher(line); if (blankLineMatcher.matches()) { if (DEBUG > 0) { System.out.println("\tFound a Blank line, skipping...\n"); } continue; // go on to the next line of the stem data file } Matcher suffixDataMatcher = suffixDataPattern.matcher(line); Boolean suffixDataWellFormated = suffixDataMatcher.matches(); if ( ! suffixDataWellFormated) { System.err.print("\nIll-formated line in the suffix data file: "); System.err.println(suffixDataFileName); System.err.println("\n\tEach line in the suffix data file must look like:\n"); System.err.println("\t, (), \n"); System.err.println("The offending line is:\n"); System.err.print("|"); System.err.print(line); System.err.println("|"); System.err.println("\nExiting...\n"); System.err.flush(); System.exit(0); } String suffixGraph = suffixDataMatcher.group(1); String suffixPOS = suffixDataMatcher.group(2); String fStructure = suffixDataMatcher.group(3); if (DEBUG > 0) { System.out.print("suffixGraph: "); System.out.println(suffixGraph); System.out.print("suffixPOS: "); System.out.println(suffixPOS); System.out.print("fStructure: "); System.out.println(fStructure); } /* Create Suffix Object that corrosponds to the row of the * suffix data file that was just read in. */ Suffix suffix = new Suffix(); suffix.setVariables(suffixGraph, suffixPOS, fStructure); /* Add this new Suffix Object to the hash table of all suffixes */ Vector v = null; /* if there is more than one suffix with the same graph */ if (hashOfSuffixes.containsKey(suffix.graph)) { /* Get vector and add new suffix to end of it */ v = hashOfSuffixes.get(suffix.graph); v.addElement(suffix); } else { /* Add suffix to vector */ v = new Vector(); v.addElement(suffix); } /* Put vector of suffix into the hash table of all suffixes */ hashOfSuffixes.put(suffix.graph, v); } // while reading a line of the file br.close(); } // try catch (Exception e) { e.printStackTrace(); } } public void parseWordFromStdinToStdout() { int DEBUG = 1; Stem stem; Suffix suffix; System.err.println(); System.err.println("Waiting for Quechua Words to analyze."); System.err.println(" Enter 1 word per line."); System.err.println(" To quit pass in the string \"**EXIT**\""); System.err.println(); System.err.println(); try { BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); Vector parsesOfAWord; String word = stdin.readLine(); while( ! word.contains("**EXIT**")) { if (DEBUG > 0) { System.err.print("The java morphology analysis code is now analyzing the word: |"); System.err.print(word); System.err.println("|"); } parsesOfAWord = parseWord(word); // print the parses of a word to stdout for (Vector parse : parsesOfAWord) { for (int morphemeIndex=0; morphemeIndex parseWord(String word) { String potentialStemGraph; String suffixGroup; Vector vectorOfStems; // can't specify a generic type in pointy braces(<>) because this Vector // will contain both Stem Objects and Suffix Objects Vector parse; Vector completeParses = new Vector(); for (int indexIntoWord=1; indexIntoWord<=word.length(); indexIntoWord++) { potentialStemGraph = word.substring(0, indexIntoWord); if (hashOfStems.containsKey(potentialStemGraph)) { // if no end index is passed to substring() then the returned // substring extends to the end of the String suffixGroup = word.substring(indexIntoWord); vectorOfStems = hashOfStems.get(potentialStemGraph); for (Stem stem : vectorOfStems) { parse = new Vector(); parse.add(stem); completeParses = parseSuffixGroup(suffixGroup, 0, stem.category, parse, completeParses); } } } return completeParses; } public Vector parseSuffixGroup(String suffixGroup, int beginIndexIntoSuffixGroup, String category, Vector parse, Vector completeParses) { String potentialSuffixGraph; Vector vectorOfSuffixes; /* Parse found! */ if (beginIndexIntoSuffixGroup == suffixGroup.length()) { completeParses.add((Vector)parse.clone()); return completeParses; } for (int endIndexIntoSuffixGroup=beginIndexIntoSuffixGroup + 1; endIndexIntoSuffixGroup<=suffixGroup.length(); endIndexIntoSuffixGroup++) { potentialSuffixGraph = suffixGroup.substring(beginIndexIntoSuffixGroup, endIndexIntoSuffixGroup); if (hashOfSuffixes.containsKey(potentialSuffixGraph)) { vectorOfSuffixes = hashOfSuffixes.get(potentialSuffixGraph); for (Suffix suffix : vectorOfSuffixes) { if (suffix.category.equals(category) || suffix.category.length() == 0) { parse.addElement(suffix); // Recursion to find the next matching suffix completeParses = parseSuffixGroup(suffixGroup, endIndexIntoSuffixGroup, category, parse, completeParses); // To find the next in line complete parse remove // the final suffix of the parse just found parse.removeElementAt(parse.size()-1); } } } } return completeParses; } /** * @param args the command line arguments which must look like: * *

*/ public static void main(String[] args) { System.err.println(); System.err.println("Starting up the java implemented Quechua Morphological Analyzer"); System.err.println(); if (args.length != 2) { System.out.print("The command line must contain exactly 2 arguments:"); System.out.println("\t

"); System.out.println("\n\tExiting..."); System.exit(0); } QuechuaSuffixParser parser = new QuechuaSuffixParser(); String stemDataFileName = args[0]; String suffixDataFileName = args[1]; System.err.print("In java analyzer: Reading the stem data file: "); System.err.println(stemDataFileName); System.err.println(); parser.readStemDataFile(stemDataFileName); System.err.print("In Java analyzer: Reading the suffix data file: "); System.err.println(suffixDataFileName); System.err.println(); parser.readSuffixDataFile(suffixDataFileName); parser.parseWordFromStdinToStdout(); System.err.println(); System.err.println("Exiting peacefully the java implemented Quechua Morphological Analyzer"); System.err.println(); } /* Suffix Class Object */ public class Suffix { /* Member Variables */ public String graph = ""; //Form of suffix public String category = ""; //Verb or Noun or Adjective public String fstructure = ""; //Value /* Member Function */ public void setVariables(String g, String c, String v) { this.graph = g; this.category = c; this.fstructure = v; } } /* Stem Class Object */ public class Stem { /* Member Variables */ public String graph = ""; //Form of suffix public String category = ""; //Verb or Noun or Adjective public String fstructure = ""; //f-structure associated with this stem /* Member Function */ public Stem(String graph, String category, String fstructure) { this.graph = graph; this.category = category; this.fstructure = fstructure; } } }