package cmonson.morphologyChallengeUtilities;

import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

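/**
 * Converts the Celex English lemma and wordform lexicons into a Morpho Challenge 2007
 * style answer key for the words listed in an English corpus types file.
 * 
 * Note: the gold-standard answer key for German, even just the morphological one, has
 * had bug fixes since the deadline for EMNLP-07 (March 2007). So the results I report
 * in the final version of the paper may end up slightly different from the results
 * reported in the originally submitted version.
 * 
 * @author cmonson
 *
 */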
public class AnalyzeCelexEnglishIntoAMorphoChallengeAnswerKey {
	
	/*
	 * A surface Word form may occur on/in more than one line of a Celex Lexicon file.
	 * This happens when a single word form can be analyzed both as a verb and
	 * as a noun, for example.
	 */
	
	// A Celex feature string looks like 'dS', or 'aP', or 'dS,aP'.
	/**
	 * One parsed line of the Celex wordform lexicon: the wordform itself, its
	 * numeric ID, the lemma it was linked to, and its raw Celex feature string.
	 * Only hashCode() is overridden; equality remains object identity.
	 */
	private static class CelexWordformData {
		int wordformID;
		String wordform;
		CelexLemmaData lemmaData;
		String celexFeatureString;

		private CelexWordformData(
				int wordformID,
				String wordForm,
				CelexLemmaData lemmaData,
				String celexFeatureString) {

			this.celexFeatureString = celexFeatureString;
			this.lemmaData = lemmaData;
			this.wordform = wordForm;
			this.wordformID = wordformID;
		}

		/** Renders as wordform#lemma#featureString, using the lemma's own toString(). */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();
			text.append(wordform).append("#");
			text.append(lemmaData).append("#");
			text.append(celexFeatureString);
			return text.toString();
		}

		/** Hash derived from the textual form scaled by the wordform ID. */
		@Override
		public int hashCode() {
			return toString().hashCode() * wordformID;
		}
	}
	
	/**
	 * One entry of the Celex lemma lexicon: the citation form, the lemma ID and
	 * the (possibly empty) derivational structure string.
	 */
	private static class CelexLemmaData {
		String citationForm;
		int lemmaID;
		private String derivationalStructure;

		private CelexLemmaData(
				String citationForm,
				int lemmaID,
				String derivationalStructure) {

			this.lemmaID = lemmaID;
			this.citationForm = citationForm;
			this.derivationalStructure = derivationalStructure;
		}

		/** Renders as citationForm#lemmaID#derivationalStructure. */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();
			text.append(citationForm);
			text.append("#").append(lemmaID);
			text.append("#").append(derivationalStructure);
			return text.toString();
		}
	}
	
	/**
	 * One line of the answer key: a wordform together with the set of
	 * distinct analyses collected for it.
	 */
	private static class MorphoChallengeAnswerKeyEntry {

		String wordform;

		Set<MorphoChallengeAnalysis> morphoChallengeAnalyses =
			new HashSet<MorphoChallengeAnalysis>();

		public MorphoChallengeAnswerKeyEntry(String wordForm) {
			this.wordform = wordForm;
		}

		/** Adds one analysis; the backing set silently drops duplicates. */
		public void 
		addMorphoChallengeAnalysis(MorphoChallengeAnalysis morphoChallengeAnalysis) {
			morphoChallengeAnalyses.add(morphoChallengeAnalysis);
		}

		/**
		 * Builds a new entry whose wordform and analyses are the lower-cased
		 * versions of this entry's. This entry is left untouched.
		 */
		public MorphoChallengeAnswerKeyEntry toLowerCase() {
			MorphoChallengeAnswerKeyEntry lowered =
				new MorphoChallengeAnswerKeyEntry(wordform.toLowerCase());

			for (MorphoChallengeAnalysis analysis : morphoChallengeAnalyses) {
				lowered.addMorphoChallengeAnalysis(analysis.toLowerCase());
			}

			return lowered;
		}

		/**
		 * Formats the entry as one answer-key line: the wordform, a tab, then
		 * the feature strings of all analyses separated by ", ".
		 */
		@Override
		public String toString() {
			StringBuilder line = new StringBuilder();
			line.append(wordform).append("\t");

			String separator = "";
			for (MorphoChallengeAnalysis analysis : morphoChallengeAnalyses) {
				line.append(separator).append(analysis.getFeatureString());
				separator = ", ";
			}

			return line.toString();
		}

		/**
		 * Merges the analyses of two entries that share a wordform into a
		 * fresh entry for that wordform.
		 *
		 * @return the merged entry, or null (after printing an error) when the
		 *         two entries have different wordforms
		 */
		public static 
		MorphoChallengeAnswerKeyEntry combineTwoAnswerKeyEntries(
				MorphoChallengeAnswerKeyEntry answerKeyEntry1, 
				MorphoChallengeAnswerKeyEntry answerKeyEntry2) {

			boolean sameWordform =
				answerKeyEntry1.wordform.equals(answerKeyEntry2.wordform);

			if (sameWordform) {
				MorphoChallengeAnswerKeyEntry merged =
					new MorphoChallengeAnswerKeyEntry(answerKeyEntry1.wordform);
				merged.morphoChallengeAnalyses.addAll(answerKeyEntry1.morphoChallengeAnalyses);
				merged.morphoChallengeAnalyses.addAll(answerKeyEntry2.morphoChallengeAnalyses);
				return merged;
			}

			System.err.println();
			System.err.println("ERROR: Can't combine MorphoChallengeAnswerKeyEntry's");
			System.err.println("  with different 'wordforms': ");
			System.err.println("    " + answerKeyEntry1.wordform);
			System.err.println("    " + answerKeyEntry2.wordform);
			System.err.println();

			return null;
		}

	}
	
	private static class MorphoChallengeAnalysis {
		
		// The surface form this analysis belongs to.
		String wordform;
		
		// A Morpho Challenge 'feature' is any string: that string
		// could be a stem, it could be a literal morpheme, or it could
		// be an abstract string representation of one or more
		// morphosyntactic features.
		//
		// Although the citation form is just another 'feature' as far as
		// Morpho Challenge is concerned, I sometimes need to treat the
		// citation form as a special case (for example when lower casing a
		// MorphoChallengeAnalysis), and so it is stored separately.
		String citationForm;
		// Inflectional tags such as "+PAST" or "+PL"; printed after the derivational features.
		List<String> inflectionalFeatures = new ArrayList<String>();
		// Stems / derivational morphemes; printed first by getFeatureString().
		List<String> derivationalFeatures = new ArrayList<String>();
		
		
		/**
		 * Creates an analysis of a wordform. The two feature lists are stored
		 * by reference, not copied.
		 *
		 * @param wordForm the surface form being analysed
		 * @param citationForm the lemma's citation form
		 * @param inflectionalFeatures inflectional tags such as "+PAST"
		 * @param derivationalFeatures stems and derivational morphemes
		 */
		public MorphoChallengeAnalysis(
				String wordForm, 
				String citationForm, 
				List<String> inflectionalFeatures, 
				List<String> derivationalFeatures) {

			this.derivationalFeatures = derivationalFeatures;
			this.inflectionalFeatures = inflectionalFeatures;
			this.citationForm = citationForm;
			this.wordform = wordForm;
		}
		
		/**
		 * Turns every Celex wordform reading of a surface form into a Morpho
		 * Challenge analysis, dropping readings that should not be printed and
		 * readings that duplicate an analysis already produced.
		 *
		 * @param celexWordformDatasThatCorrespondToSurfaceForm the Celex readings to convert
		 * @param surfaceForm the surface form all those readings belong to
		 * @return the distinct analyses, in the order they were first produced
		 */
		public static List<MorphoChallengeAnalysis> 
		convertToListOfMorphoChallengeAnalyses(
				Set<CelexWordformData> celexWordformDatasThatCorrespondToSurfaceForm, 
				String surfaceForm) {

			// The conversion of one feature string depends on which other feature
			// strings the same surface form carries, so gather them all first.
			Set<String> allFeatureStrings = new HashSet<String>();
			for (CelexWordformData reading : celexWordformDatasThatCorrespondToSurfaceForm) {
				allFeatureStrings.add(reading.celexFeatureString);
			}

			List<MorphoChallengeAnalysis> distinctAnalyses =
				new ArrayList<MorphoChallengeAnalysis>();

			for (CelexWordformData reading : celexWordformDatasThatCorrespondToSurfaceForm) {

				List<String> inflections =
					celexFeatureCombinationToMorphoChallengeFeatures(
							reading.celexFeatureString,
							allFeatureStrings,
							reading,
							surfaceForm);

				// null means: generate NO interpretation from this feature string
				if (inflections == null) {
					continue;
				}

				String lemma = reading.lemmaData.citationForm;

				// This answer key only includes inflectional features, so the
				// 'derivational' part is just the citation form. (An answer key
				// with real derivational features would call
				// getDerivationalFeatures(reading.lemmaData) here instead.)
				List<String> lemmaOnly = new ArrayList<String>();
				lemmaOnly.add(lemma);

				MorphoChallengeAnalysis analysis =
					new MorphoChallengeAnalysis(surfaceForm, lemma, inflections, lemmaOnly);

				// English is morphologically poor enough that two Celex wordforms
				// can yield identical analyses (every adjective is also listed as
				// an adverb, see 'bigger'). Keep only the first of each.
				if (distinctAnalyses.contains(analysis)) {
					continue;
				}
				distinctAnalyses.add(analysis);
			}

			return distinctAnalyses;
		}
		

		/*
		 * Convert one coherent Celex feature group, such as 'a1S', into
		 * features that closely mimic the official Morpho Challenge 2007 gold standard.
		 * 
		 * 'celexFeatureString' holds the one coherent Celex feature group.
		 * 'celexFeatureStringsAsSet' holds every feature group the same surface
		 * form can mark; the output sometimes depends on them.
		 * 'celexWordformData' is not consulted by the conversion itself.
		 * 'surfaceForm' is only inspected for a trailing genitive marker.
		 * 
		 * Returns the (possibly empty) list of features, or null when no
		 * analysis at all should be generated from 'celexFeatureString'.
		 */
		private static ArrayList<String> 
		celexFeatureCombinationToMorphoChallengeFeatures(
				String celexFeatureString, 
				Set<String> celexFeatureStringsAsSet, 
				CelexWordformData celexWordformData,
				String surfaceForm) {

			ArrayList<String> tags = new ArrayList<String>();

			if (celexFeatureString.matches(".*a.*")) {
				// Verbs, past

				if (celexFeatureString.equals("pa")) {
					tags.add("+PCP2");   // past participle
				}

				// Celex lists a plain past form such as 'abandoned' once per
				// person/number combination, but the answer key should contain
				// it only once. The first code below that the surface form
				// carries is the one that gets printed; every other past
				// reading is suppressed.
				String[] plainPastCodes = { "a1S", "a2S", "a3S", "aP" };
				for (String code : plainPastCodes) {
					if ( ! celexFeatureStringsAsSet.contains(code)) {
						continue;
					}
					if ( ! celexFeatureString.equals(code)) {
						return null;
					}
					tags.add("+PAST");
					break;
				}

			} else if (celexFeatureString.matches(".*e.*")) {
				// Verbs, present

				if (celexFeatureString.equals("pe")) {
					tags.add("+PCP1");   // present participle
				}

				if (celexFeatureString.equals("e3S")) {
					tags.add("+3SG");
				}

				// Same idea as for the plain past: only the reading whose code
				// comes first in this list survives, and it adds no feature.
				String[] plainPresentCodes = { "i", "e1S", "e2S", "eP" };
				for (String code : plainPresentCodes) {
					if ( ! celexFeatureStringsAsSet.contains(code)) {
						continue;
					}
					if ( ! celexFeatureString.equals(code)) {
						return null;
					}
					break;
				}

			} else if (celexFeatureString.equals("c")) {
				// Adjectives: comparative. (The positive, 'b', has no morpheme
				// and therefore adds no feature.)
				tags.add("+CMP");

			} else if (celexFeatureString.equals("s")) {
				// Adjectives: superlative
				tags.add("+SUP");

			} else if (celexFeatureString.equals("P")) {
				// Nouns: plural
				tags.add("+PL");
			}

			// and don't forget the genitive
			if (surfaceForm.matches(".*'s?")) {
				tags.add("+GEN");
			}

			return tags;
		}
			
		

		/**
		 * Extracts one feature per "(morpheme)[POS]" group found in the lemma's
		 * derivational structure. A morpheme whose POS contains '|' is tagged
		 * "_s" so that a derivational affix is not confused with a stem; any
		 * other morpheme is tagged "_" followed by its POS. When the structure
		 * holds no such group, the citation form is the only feature returned.
		 */
		@SuppressWarnings("unused")
		private static ArrayList<String> getDerivationalFeatures(CelexLemmaData lemmaData) {

			//    \\(        a literal '('
			//    (\\w+)     group 1 - a stem or derivational morpheme
			//    \\)        a literal ')'
			//    \\[        a literal '['
			//    ([^\\]]+)  group 2 - the POS: a run of characters that are not ']'
			//    \\]        a literal ']'
			Matcher morphemes =
				Pattern.compile("\\((\\w+)\\)\\[([^\\]]+)\\]")
					.matcher(lemmaData.derivationalStructure);

			ArrayList<String> collected = new ArrayList<String>();
			while (morphemes.find()) {
				String partOfSpeech = morphemes.group(2);
				String tag = partOfSpeech.contains("|") ? "_s" : "_" + partOfSpeech;
				collected.add(morphemes.group(1) + tag);
			}

			// Celex specified nothing: fall back to the citation form
			if (collected.isEmpty()) {
				collected.add(lemmaData.citationForm);
			}

			return collected;
		}

		/** Returns the derivational features immediately followed by the inflectional ones. */
		public String getFeatureString() {
			return getDerivationalFeatureString() + getInflectionalFeatureString();
		}
		
		/** Returns every inflectional feature preceded by a single space (empty when there are none). */
		public String getInflectionalFeatureString() {
			StringBuilder joined = new StringBuilder();
			for (String tag : inflectionalFeatures) {
				joined.append(" ").append(tag);
			}
			return joined.toString();
		}

		/** Returns the derivational features joined by single spaces, with no leading or trailing space. */
		public String getDerivationalFeatureString() {
			StringBuilder joined = new StringBuilder();
			String separator = "";
			for (String morpheme : derivationalFeatures) {
				joined.append(separator).append(morpheme);
				separator = " ";
			}
			return joined.toString();
		}
		
		/**
		 * Returns a lower-cased copy of this analysis.
		 * <p>
		 * The wordform and the citation form are lower cased. Because the
		 * printed answer key is built from {@code derivationalFeatures} rather
		 * than from the {@code citationForm} field, any derivational feature
		 * that IS the citation form is lower cased as well; otherwise the
		 * written key would keep the original capitalisation. Other
		 * derivational features (e.g. "stem_POS" strings) and all inflectional
		 * features are carried over unchanged.
		 * <p>
		 * The derivational list of the copy is a new list; the inflectional
		 * list is shared with this instance.
		 */
		public MorphoChallengeAnalysis toLowerCase() {

			String lowerCasedCitationForm = citationForm.toLowerCase();

			List<String> lowerCasedDerivationalFeatures =
				new ArrayList<String>(derivationalFeatures.size());
			for (String feature : derivationalFeatures) {
				if (citationForm.equals(feature)) {
					lowerCasedDerivationalFeatures.add(lowerCasedCitationForm);
				} else {
					lowerCasedDerivationalFeatures.add(feature);
				}
			}

			return new MorphoChallengeAnalysis(
					wordform.toLowerCase(),
					lowerCasedCitationForm,
					inflectionalFeatures,
					lowerCasedDerivationalFeatures);
		}
		
		/** Renders as the wordform, a space, then the derivational and inflectional feature strings. */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();
			text.append(wordform);
			text.append(" ");
			text.append(getDerivationalFeatureString());
			text.append(getInflectionalFeatureString());
			return text.toString();
		}
		
		/**
		 * Hash consistent with {@link #equals(Object)}.
		 * <p>
		 * equals() compares the inflectional features as a set (order and
		 * repetition are ignored), so the hash must not depend on list order
		 * either. The features are therefore hashed through a HashSet, whose
		 * hash is the order-independent sum of its elements' hashes, instead
		 * of through the order-sensitive List.hashCode().
		 */
		@Override
		public int hashCode() {
			int featureSetHash = new HashSet<String>(inflectionalFeatures).hashCode();
			return 31 * citationForm.hashCode() + featureSetHash;
		}
		
		/**
		 * Two analyses are equal when they have the same wordform and citation
		 * form and when each one's inflectional and derivational features are
		 * all contained in the other's (order and repetition are ignored).
		 */
		@Override
		public boolean equals(Object o) {
			if ( ! (o instanceof MorphoChallengeAnalysis)) {
				return false;
			}
			MorphoChallengeAnalysis other = (MorphoChallengeAnalysis) o;

			boolean sameForms =
				this.wordform.equals(other.wordform)
				&& this.citationForm.equals(other.citationForm);
			if ( ! sameForms) {
				return false;
			}

			boolean sameInflections =
				this.inflectionalFeatures.containsAll(other.inflectionalFeatures)
				&& other.inflectionalFeatures.containsAll(this.inflectionalFeatures);
			if ( ! sameInflections) {
				return false;
			}

			return this.derivationalFeatures.containsAll(other.derivationalFeatures)
				&& other.derivationalFeatures.containsAll(this.derivationalFeatures);
		}


	}
	
	// Input readers and the output writer; all are opened by the constructor.
	private BufferedReader englishTypes_BufferedReader;
	private BufferedReader celexLemmaLexicon_BufferedReader;
	private BufferedReader celexWordformsLexicon_BufferedReader;
	private PrintWriter morphoChallengeAnswerKey_PrintWriter;
	
	// The surface forms listed in the English Corpus Types file
	Set<String> surfaceForms = new HashSet<String>();
	
	// The Celex Lemma lexicon is initially read into this data structure.
	// Then, while reading the Celex Wordform lexicon, the lemma info is
	// attached to the info for each wordform.
	private Map<Integer, CelexLemmaData> celexLemmaLexiconByLemmaID = 
		new TreeMap<Integer, CelexLemmaData>();
	
	// Every Celex wordform reading, grouped by the wordform's spelling.
	private Map<String, Set<CelexWordformData>> celexWordformsByWordform = 
		new TreeMap<String, Set<CelexWordformData>>();
	
	// The answer key being built, keyed (and therefore written out sorted) by wordform.
	private Map<String, MorphoChallengeAnswerKeyEntry> 
		morphoChallengeAnswerKeyEntriesByWordform =
			new TreeMap<String, MorphoChallengeAnswerKeyEntry>();
	
	
	public 
	AnalyzeCelexEnglishIntoAMorphoChallengeAnswerKey(
			File englishCorpusTypesFile,
			File celexLemmaLexiconFile,
			File celexWordFormsLexiconFile,
			File morphoChallengeAnswerKeyFile) throws IOException {
		
		englishTypes_BufferedReader =
			openFileForReading(englishCorpusTypesFile);
		celexLemmaLexicon_BufferedReader =
			openFileForReading(celexLemmaLexiconFile);
		celexWordformsLexicon_BufferedReader =
			openFileForReading(celexWordFormsLexiconFile);
		
		morphoChallengeAnswerKey_PrintWriter = 
			openFileForWriting(morphoChallengeAnswerKeyFile, "ISO-8859-1"); // latin-1
		
		readDataFiles();
	}

	/**
	 * Reads the English types file, the Celex lemma lexicon and the Celex
	 * wordform lexicon, in that order (the wordform lexicon needs the lemmas).
	 * <p>
	 * The three readers are not consulted again after this method, so they
	 * are closed here, whether or not reading succeeded.
	 *
	 * @throws IOException if reading or closing any of the files fails
	 */
	private void readDataFiles() throws IOException {
		try {
			readEnglishTypes();
			readCelexLemmaLexicon();
			readCelexWordformLexicon();
		} finally {
			// Nested so that a failing close() does not prevent the others.
			try {
				englishTypes_BufferedReader.close();
			} finally {
				try {
					celexLemmaLexicon_BufferedReader.close();
				} finally {
					celexWordformsLexicon_BufferedReader.close();
				}
			}
		}
	}
	
	/**
	 * Reads the English types file into {@code surfaceForms}.
	 * <p>
	 * Blank lines are skipped. From every other line the second
	 * whitespace-separated token is taken as the surface form. A line without
	 * a second token is reported and terminates the program with a non-zero
	 * exit status.
	 * <p>
	 * The English Types file is the list of English words supplied by the
	 * MorphoChallenge 2007. This list contains words with 'genitive'
	 * markers, 's, or just ', attached. Celex itself does not list words
	 * with these quasi-morphological markers attached, but MorphoChallenge 2007
	 * does include them.
	 *
	 * @throws IOException if the file cannot be read
	 */
	private void readEnglishTypes() throws NumberFormatException, IOException {
		System.err.println();
		System.err.println("Reading the English Types File...");
		System.err.println();
		
		// Compiled once instead of once per line.
		final Pattern blankLinePattern = Pattern.compile("^\\s*$");
		final Pattern surfaceFormPattern = Pattern.compile("^\\S+\\s+(\\S+).*$");
		
		int lineCounter = 0;
		String lineFromEnglishTypesFile;
		while ((lineFromEnglishTypesFile = 
			englishTypes_BufferedReader.readLine()) != null) {
			
			// skip blank lines
			if (blankLinePattern.matcher(lineFromEnglishTypesFile).matches()) {
				continue;
			}
			
			lineCounter++;
			if ((lineCounter%10000) == 0) {
				System.err.println("  " + lineCounter + " " + lineFromEnglishTypesFile);
				System.err.flush();
			}
		
			Matcher surfaceFormMatcher = surfaceFormPattern.matcher(lineFromEnglishTypesFile);
		
			if ( ! surfaceFormMatcher.matches()) {
				System.err.println("The English types file is bad");
				System.err.println(" line: " + lineFromEnglishTypesFile);
				System.err.println("  EXITING...");
				System.err.println();
				// non-zero: this is a failure, not a normal termination
				System.exit(1);
			}
			
			surfaceForms.add(surfaceFormMatcher.group(1));
		}
		
	}

	/**
	 * Reads the Celex lemma lexicon into {@code celexLemmaLexiconByLemmaID}.
	 * <p>
	 * Each non-blank line is a list of '\' separated fields, of which three
	 * are used:
	 * <pre>
	 *  1)  lemmaID
	 *  2)  citation form
	 *  ...
	 *  22) derivational structure, including all the smallest stems and
	 *      derivational morphemes. The derivational structure may be empty.
	 * </pre>
	 * Some of the fields between 2 and 22 may be empty. A line that does not
	 * fit this layout is reported and terminates the program with a non-zero
	 * exit status.
	 * <p>
	 * The official Morphology Challenge English answer key is lower cased, but
	 * nothing is lower cased here; that happens later.
	 *
	 * @throws IOException if the file cannot be read
	 */
	private void readCelexLemmaLexicon() throws IOException {
		System.err.println();
		System.err.println("Reading the English Celex Lemma Lexicon...");
		System.err.println();
		
		// Compiled once instead of once per line.
		final Pattern blankLinePattern = Pattern.compile("^\\s*$");
		
		// To match a single '\' character in the file, 4 '\' characters are
		// needed in the Pattern String. Two of them disappear because of the
		// string literal, and the two remaining match a '\' within the
		// regular expression.
		final Pattern citationFormPattern =             
			//                lemmaID \ cit. form \ 19 unneeded fields\ deriv. struct. \... 
			Pattern.compile("^(\\d+)\\\\([^\\\\]+)\\\\([^\\\\]*\\\\){19}([^\\\\]*).*$"); 
		
		int lineCounter = 0;
		String lineFromCelexLemmaLexicon;
		while ((lineFromCelexLemmaLexicon = celexLemmaLexicon_BufferedReader.readLine()) != null) {
			
			// skip blank lines
			if (blankLinePattern.matcher(lineFromCelexLemmaLexicon).matches()) {
				continue;
			}
			
			lineCounter++;
			if ((lineCounter%10000) == 0) {
				System.err.println("  " + lineCounter + " " + lineFromCelexLemmaLexicon);
				System.err.flush();
			}
			
			Matcher citationFormMatcher = 
				citationFormPattern.matcher(lineFromCelexLemmaLexicon);
		
			if ( ! citationFormMatcher.matches()) {
				System.err.println("The Celex Lexicon file is bad");
				System.err.println(" line: " + lineFromCelexLemmaLexicon);
				System.err.println("  EXITING...");
				System.err.println();
				// non-zero: this is a failure, not a normal termination
				System.exit(1);
			}
			
			int lemmaID = Integer.parseInt(citationFormMatcher.group(1));
			String citationForm = citationFormMatcher.group(2);
			// group(3) is the last of the 19 unneeded fields
			String derivationalStructure = citationFormMatcher.group(4);
			
			celexLemmaLexiconByLemmaID.put(
					lemmaID, 
					new CelexLemmaData(citationForm, lemmaID, derivationalStructure));
		}
	}
	
	/**
	 * Reads the Celex wordform lexicon into {@code celexWordformsByWordform},
	 * linking every wordform to its lemma from {@code celexLemmaLexiconByLemmaID}.
	 * <p>
	 * Each non-blank line is a list of 6 '\' separated fields:
	 * <pre>
	 *  1) wordform ID
	 *  2) wordform
	 *  3) count of wordform in the 'Mannheim' corpus, useless
	 *  4) lemmaID, to link this wordform up with its lemma/citation form
	 *  5) celex English feature codes
	 *  6) specifies spelling variation that occurs with inflectional morphology
	 * </pre>
	 * Wordforms containing whitespace are skipped. A wordform whose lemma ID
	 * is not in the lemma lexicon is skipped with a warning, because later
	 * stages dereference the lemma unconditionally. A line that does not fit
	 * the layout is reported and terminates the program with a non-zero exit
	 * status, as the message says and as the other two readers do.
	 *
	 * @throws IOException if the file cannot be read
	 */
	private void readCelexWordformLexicon() throws NumberFormatException, IOException {
		System.err.println(); 
		System.err.println("Reading the English Celex Wordform Lexicon...");
		System.err.println();
		
		// Compiled once instead of once per line.
		final Pattern blankLinePattern = Pattern.compile("^\\s*$");
		final Pattern containsWhitespacePattern = Pattern.compile("^.*\\s.*$");
		final Pattern wordformPattern = 
			Pattern.compile(
					"^(\\d+)\\\\([^\\\\]+)\\\\(\\d+)\\\\(\\d+)\\\\([^\\\\]+)\\\\(.*)$"); 
		
		int lineCounter = 0;
		String lineFromCelexWordformLexicon;
		while ((lineFromCelexWordformLexicon = celexWordformsLexicon_BufferedReader.readLine()) != null) {
			
			// skip blank lines
			if (blankLinePattern.matcher(lineFromCelexWordformLexicon).matches()) {
				continue;
			}
			
			lineCounter++;
			if ((lineCounter%10000) == 0) {
				System.err.println("  " + lineCounter + " " + lineFromCelexWordformLexicon);
				System.err.flush();
			}
			
			Matcher wordformMatcher = 
				wordformPattern.matcher(lineFromCelexWordformLexicon);
		
			if ( ! wordformMatcher.matches()) {
				System.err.println("The Celex Wordform lexiconfile is bad");
				System.err.println(" line: " + lineFromCelexWordformLexicon);
				System.err.println("  EXITING...");
				System.err.println();
				// Actually exit: returning here would silently continue with a
				// truncated lexicon.
				System.exit(1);
			}
			
			int wordformID = Integer.parseInt(wordformMatcher.group(1));
			
			String wordform = wordformMatcher.group(2);
			// skip wordforms containing a space
			if (containsWhitespacePattern.matcher(wordform).matches()) {
				continue;
			}
			
			int lemmaID = Integer.parseInt(wordformMatcher.group(4));
			CelexLemmaData celexLemmaDataOfWordForm = celexLemmaLexiconByLemmaID.get(lemmaID);
			if (celexLemmaDataOfWordForm == null) {
				System.err.println(
						"  WARNING: skipping wordform '" + wordform + 
						"' because its lemma ID " + lemmaID + 
						" is not in the Celex Lemma Lexicon");
				continue;
			}
					
			String celexFeatureCodesAsString = wordformMatcher.group(5);
			
			Set<CelexWordformData> celexWordforms = celexWordformsByWordform.get(wordform);
			if (celexWordforms == null) {
				celexWordforms = new HashSet<CelexWordformData>();
				celexWordformsByWordform.put(wordform, celexWordforms);
			}
			
			celexWordforms.add(
					new CelexWordformData(
							wordformID, 
							wordform, 
							celexLemmaDataOfWordForm,
							celexFeatureCodesAsString));
		}
	}

	/**
	 * Opens a file for buffered reading, decoding it as ISO-8859-1 (latin-1).
	 * <p>
	 * If the file cannot be opened, an explanation is printed to standard
	 * error and the program terminates with a non-zero exit status, so this
	 * method never returns null in practice.
	 *
	 * @param fileToOpen the file to read
	 * @return a reader positioned at the start of the file
	 */
	public BufferedReader openFileForReading(File fileToOpen) {
		
		BufferedReader bufferedReaderToReturn = null;
		
		try {
			bufferedReaderToReturn = 
				new BufferedReader(
						new InputStreamReader(
								new FileInputStream(fileToOpen),
							    "ISO-8859-1")); //latin 1
		}
		catch(FileNotFoundException e) {	
			System.err.println();
			System.err.println("  Sorry.  The file: " + fileToOpen.getAbsolutePath());
			System.err.println("    could not be read.  Here is the full Java error:");
			System.err.println();
			System.err.println(e.getMessage());
			System.err.println();
			System.err.println("  Did NOT successfully set the corpus path.");
			System.err.println();
			// non-zero: this is a failure, not a normal termination
			System.exit(1);
		}
		catch(Exception e) {
			System.err.println();
			System.err.println("  Sorry.  While opening the file: " + fileToOpen.getAbsolutePath());
			System.err.println("    an error was encountered.  Here is the full Java error:");
			System.err.println();
			System.err.println(e.getMessage());
			System.err.println();
			System.err.println("  Did NOT successfully set the corpus path.");
			System.err.println();
			// non-zero: this is a failure, not a normal termination
			System.exit(1);
		}
		
		return bufferedReaderToReturn;
	}
	
	/**
	 * Opens (creating or overwriting) a file for buffered, auto-flushing
	 * writing in the given character encoding.
	 * <p>
	 * If the file cannot be opened, an explanation is printed to standard
	 * error and the program terminates with a non-zero exit status, so this
	 * method never returns null in practice.
	 *
	 * @param fileToOpen the file to write
	 * @param encoding the name of the character encoding, e.g. "ISO-8859-1"
	 * @return a writer for the file
	 */
	private PrintWriter openFileForWriting(File fileToOpen, String encoding) {

		PrintWriter printWriterToReturn = null;
		
		try {
			printWriterToReturn = 
				new PrintWriter(
						new BufferedWriter(
								new OutputStreamWriter(
										new FileOutputStream(fileToOpen),
										encoding)),
						true); // true to autoflush
			
		} catch (FileNotFoundException e) {
			System.err.println();
			System.err.println("Cannot set the output file:");
			System.err.println("  " + fileToOpen.getAbsolutePath());
			System.err.println();
			// non-zero: this is a failure, not a normal termination
			System.exit(1);

		} catch (IOException e) {
			System.err.println("Failed to open the output file because");
			System.err.println("  of the following internal error:");
			e.printStackTrace();
			System.err.println();
			// non-zero: this is a failure, not a normal termination
			System.exit(1);
		}
		
		return printWriterToReturn;
	}

	
	/**
	 * Command line entry point: reads the input files, converts the Celex
	 * analyses to Morpho Challenge style, lower cases them, and writes the
	 * answer key.
	 *
	 * @param args the command line arguments, which must be exactly:
	 *
	 * &lt;path-to-EnglishCorpusTypesFile&gt; &lt;path-to-CelexLemmaLexiconFile&gt;
	 * &lt;path-to-CelexWordformsLexiconFile&gt; &lt;path-to-MorphoChallengeAnswerKeyOutputFile&gt;
	 *
	 * With any other number of arguments the usage text is printed and the
	 * program terminates with a non-zero exit status.
	 * @throws IOException if reading one of the input files fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 4) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java AnalyzeCelexEnglishIntoAMorphoChallengeAnswerKey " + String.format("%n") +
			 "        <path-to-EnglishCorpusTypesFile> " + String.format("%n") +
			 "        <path-to-CelexLemmaLexiconFile> " + String.format("%n") +
			 "        <path-to-CelexWordformsLexiconFile>" + String.format("%n") +
			 "        <path-to-MorphoChallengeAnswerKeyOutputFile>" + String.format("%n%n") +
			 
			 "        The English Types file is the list of English words supplied by the" + String.format("%n") +
			 "          MorphoChallenge 2007. This list of words contains words with 'Genetive'" + String.format("%n") +
			 "          markers, 's, or just, ', attached. Celex itself does not list words" + String.format("%n") +
			 "          with these quasi-morphological markers attached, but MorphoChallenge 2007" + String.format("%n") +
			 "          does include them." + String.format("%n%n"));

			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			// A wrong command line is an error, so do not report success.
			System.exit(1);
		}
		
		AnalyzeCelexEnglishIntoAMorphoChallengeAnswerKey converter = 
			new AnalyzeCelexEnglishIntoAMorphoChallengeAnswerKey(
					new File(args[0]), 
					new File(args[1]),
					new File(args[2]),
					new File(args[3]));
		
		converter.convertFromCelexToMorphoChallenge();
		converter.lowercaseMorphoChallengeAnswerKeyEntries();
		converter.writeMorphoChallengeAnswerKeys();
	}    
	
	/**
	 * For every surface form read from the types file, looks up the matching
	 * Celex wordform readings, converts them to Morpho Challenge analyses, and
	 * files each analysis under its wordform in
	 * {@code morphoChallengeAnswerKeyEntriesByWordform}. Surface forms that
	 * yield no analysis get no entry.
	 */
	public void convertFromCelexToMorphoChallenge() {
		System.err.println();
		System.err.println("Converting from Celex style features to Morphology Challenge Style answer key format");
		System.err.println();

		int converted = 0;
		for (String surfaceForm : surfaceForms) {

			converted++;
			if ((converted % 10000) == 0) {
				System.err.println("  " + converted + " surface forms converted");
			}

			Set<CelexWordformData> readings = getWordformDatasThatCorrespondTo(surfaceForm);

			List<MorphoChallengeAnalysis> analyses =
				MorphoChallengeAnalysis.convertToListOfMorphoChallengeAnalyses(
						readings,
						surfaceForm);

			for (MorphoChallengeAnalysis analysis : analyses) {
				String key = analysis.wordform;

				// fetch the entry for this wordform, creating it on first use
				MorphoChallengeAnswerKeyEntry entry =
					morphoChallengeAnswerKeyEntriesByWordform.get(key);
				if (entry == null) {
					entry = new MorphoChallengeAnswerKeyEntry(key);
					morphoChallengeAnswerKeyEntriesByWordform.put(key, entry);
				}

				entry.addMorphoChallengeAnalysis(analysis);
			}
		}
	}
	
	/**
	 * Finds the Celex wordform readings for a surface form.
	 * <p>
	 * A surface form listed verbatim in the wordform lexicon gets exactly the
	 * readings stored for it. Otherwise, if it ends in a genitive marker
	 * ("'" or "'s") and the part before the marker is in the lexicon, it gets
	 * those of that part's readings that may be nouns. In every other case
	 * the result is empty.
	 *
	 * @param surfaceForm the surface form to look up
	 * @return the matching readings; possibly empty, never null
	 */
	private Set<CelexWordformData> getWordformDatasThatCorrespondTo(String surfaceForm) {

		// The simple case: the surface form is itself a Celex wordform.
		if (celexWordformsByWordform.containsKey(surfaceForm)) {
			return celexWordformsByWordform.get(surfaceForm);
		}

		Set<CelexWordformData> nounReadings = new HashSet<CelexWordformData>();

		// Not genitive-looking: nothing in Celex corresponds to it.
		Matcher genitive = Pattern.compile("(^.*)('s?)").matcher(surfaceForm);
		if ( ! genitive.matches()) {
			return nounReadings;
		}

		// Genitive-looking, but the base is unknown to Celex: again nothing.
		Set<CelexWordformData> baseReadings =
			celexWordformsByWordform.get(genitive.group(1));
		if (baseReadings == null) {
			return nounReadings;
		}

		// The Morpho Challenge 2007 answer key assumes that genitives are only
		// formed from nouns, so keep just the readings whose derivational
		// structure ends in [N]. The derivational structure is sometimes not
		// specified at all; such readings are always allowed a genitive
		// analysis, since no part of speech is available to rule them out.
		for (CelexWordformData reading : baseReadings) {
			String derivation = reading.lemmaData.derivationalStructure;
			boolean unspecified = (derivation.length() == 0);
			if (unspecified || derivation.matches("^.*\\[N\\]")) {
				nounReadings.add(reading);
			}
		}

		return nounReadings;
	}

	// The corpus this answer key is scored against (from the Morphology
	// Challenge 2007) is all lower case, hence the answer key needs to be in
	// lower case too. Answer key entries may *collide* once they are lower
	// cased; colliding entries are merged into one.
	private void lowercaseMorphoChallengeAnswerKeyEntries() {

		System.err.println();
		System.err.println("Lower casing the MorpholChallengeAnswerKeyEntries");
		System.err.println();

		Map<String, MorphoChallengeAnswerKeyEntry> mergedByLoweredWordform =
			new TreeMap<String, MorphoChallengeAnswerKeyEntry>();

		int processed = 0;
		for (Map.Entry<String, MorphoChallengeAnswerKeyEntry> caseSensitive : 
				morphoChallengeAnswerKeyEntriesByWordform.entrySet()) {

			processed++;
			if ((processed % 10000) == 0) {
				System.err.println(
						processed + 
						" Morphology Challenge Answer Key Entries have been lower cased");
			}

			String loweredWordform = caseSensitive.getKey().toLowerCase();
			MorphoChallengeAnswerKeyEntry loweredEntry =
				caseSensitive.getValue().toLowerCase();

			// If an earlier entry already occupies this lower-cased wordform,
			// store the union of the two instead of overwriting it.
			MorphoChallengeAnswerKeyEntry toStore = loweredEntry;
			if (mergedByLoweredWordform.containsKey(loweredWordform)) {
				toStore =
					MorphoChallengeAnswerKeyEntry.combineTwoAnswerKeyEntries(
							mergedByLoweredWordform.get(loweredWordform),
							loweredEntry);
			}

			mergedByLoweredWordform.put(loweredWordform, toStore);
		}

		morphoChallengeAnswerKeyEntriesByWordform = mergedByLoweredWordform;
	}

	/**
	 * Writes every answer key entry, one per line and in wordform order, to
	 * the output file, then closes the file.
	 * <p>
	 * A PrintWriter never throws on a failed write; it only records the
	 * failure. The recorded state is therefore checked once everything has
	 * been written, and a failure is reported and terminates the program with
	 * a non-zero exit status rather than leaving a silently truncated key.
	 * Because the writer is closed here, this method can only be used once.
	 */
	private void writeMorphoChallengeAnswerKeys() {
		
		System.err.println();
		System.err.println("Writing out the Morphology Challenge answer key");
		System.err.println();
		
		boolean writeFailed;
		try {
			int wordformCounter = 0;
			for (MorphoChallengeAnswerKeyEntry morphoChallengeAnswerKeyEntry : 
				morphoChallengeAnswerKeyEntriesByWordform.values()) {
				
				wordformCounter++;
				if ((wordformCounter % 10000) == 0) {
					System.err.println("  " + wordformCounter + " wordforms written");
				}
				
				morphoChallengeAnswerKey_PrintWriter.println(
						morphoChallengeAnswerKeyEntry.toString());
			}
			
			// checkError() flushes and reports whether any write has failed
			writeFailed = morphoChallengeAnswerKey_PrintWriter.checkError();
			
		} finally {
			morphoChallengeAnswerKey_PrintWriter.close();
		}
		
		if (writeFailed) {
			System.err.println();
			System.err.println("ERROR: Failed while writing the Morphology Challenge answer key.");
			System.err.println("  The output file is probably incomplete.");
			System.err.println();
			System.exit(1);
		}
	}	
}

