package cmonson.morphologyChallengeUtilities;

import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * The gold-standard answer key for German (even just the morphological one) has had
 * bug fixes since the deadline for EMNLP-07 (March 2007). So the results I report in
 * the final version of the paper may end up slightly different from the results
 * reported in the originally submitted version -- see readCelexWordFormLexicon() for
 * an example.
 * 
 * But this version of AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey is meant to be
 * a backup that is very close to the version that produced the answer key used in the
 * EMNLP-07 paper.
 * 
 * @author cmonson
 *
 */
public class AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey_versionForEMNLP2007 {
	
	/*
	 * A surface Word form may occur on/in more than one line of a Celex Lexicon file.
	 * This happens when a single word form can be analyzed both as a verb and
	 * as a noun, for example.
	 */

	/**
	 * Gathers every {@link CelexAnalysis} that has been recorded for one surface
	 * wordform, in the order in which the analyses were added.
	 */
	private static class AllCelexInfoForAWordform {
		String wordform;
		List<CelexAnalysis> celexAnalzses = new ArrayList<CelexAnalysis>();

		public AllCelexInfoForAWordform(String wordform) {
			this.wordform = wordform;
		}

		/** Appends one more analysis of this wordform. */
		public void addCelexAnalysis(CelexAnalysis celexAnalysis) {
			celexAnalzses.add(celexAnalysis);
		}

		/** @return the wordform followed by each analysis, separated by single spaces */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();
			text.append(wordform);
			for (CelexAnalysis analysis : celexAnalzses) {
				text.append(' ').append(analysis.toString());
			}
			return text.toString();
		}
	}
	
	// A celex feature string is something like 'dS', or 'aP'
	// so 'dS,aP' is two separate celex feature strings and so 'dS,aP' generates
	// two separate 'CelexAnalysis' instances.
	/**
	 * One reading of a wordform: the wordform itself, the lemma it is linked to,
	 * and exactly one Celex feature string.
	 */
	private static class CelexAnalysis {
		String wordForm;
		CelexLemmaData lemmaData;
		String celexFeatureString;

		private CelexAnalysis(
				String wordForm,
				CelexLemmaData lemmaData,
				String celexFeatureString) {

			this.celexFeatureString = celexFeatureString;
			this.lemmaData = lemmaData;
			this.wordForm = wordForm;
		}

		/** @return the three parts of this analysis joined by '#' */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();
			text.append(wordForm);
			text.append('#').append(lemmaData);
			text.append('#').append(celexFeatureString);
			return text.toString();
		}
	}
	
	/** The citation form and numeric ID of one lemma. */
	private static class CelexLemmaData {
		String citationForm;
		int lemmaID;

		private CelexLemmaData(String citationForm, int lemmaID) {
			this.lemmaID = lemmaID;
			this.citationForm = citationForm;
		}

		/** @return the citation form and the lemma ID joined by '#' */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();
			text.append(citationForm).append('#').append(lemmaID);
			return text.toString();
		}
	}
	
	/**
	 * One line of a Morphology Challenge style answer key: a surface wordform
	 * together with the set of distinct analyses known for that wordform.
	 */
	private static class MorphoChallengeAnswerKeyEntry {
		
		String wordform;
		
		// A Set, so analyses that compare equal are stored only once.
		Set<MorphoChallengeAnalysis> morphoChallengeAnalyses =
			new HashSet<MorphoChallengeAnalysis>();
		
		public MorphoChallengeAnswerKeyEntry(String wordForm) {
			this.wordform = wordForm;
		}

		/** Adds an analysis to this entry; an equal analysis already present is kept. */
		public void 
		addMorphoChallengeAnalysis(MorphoChallengeAnalysis morphoChallengeAnalysis) {
			morphoChallengeAnalyses.add(morphoChallengeAnalysis);
		}

		/**
		 * Builds a new entry whose wordform and analyses are lower cased.
		 * This entry is left untouched.
		 *
		 * @return the lower cased copy of this entry
		 */
		public MorphoChallengeAnswerKeyEntry toLowerCase() {

			// Locale.ROOT keeps the result independent of the JVM's default locale
			// (under a Turkish locale, for example, 'I' would lower case to a
			// dotless 'i').
			MorphoChallengeAnswerKeyEntry lowerCasedAnswerKeyEntry =
				new MorphoChallengeAnswerKeyEntry(wordform.toLowerCase(Locale.ROOT));
			
			for (MorphoChallengeAnalysis morphoChallengeAnalysis : morphoChallengeAnalyses) {
				lowerCasedAnswerKeyEntry.addMorphoChallengeAnalysis(
						morphoChallengeAnalysis.toLowerCase());
			}
			
			return lowerCasedAnswerKeyEntry;
		}	
	
		/**
		 * @return the answer key line for this entry: the wordform, a tab, and
		 *         then the feature string of each analysis separated by ", "
		 */
		@Override
		public String toString() {
			StringBuilder line = new StringBuilder();
			line.append(wordform).append("\t");
			
			String separator = "";
			for (MorphoChallengeAnalysis morphoChallengeAnalysis : morphoChallengeAnalyses) {
				line.append(separator);
				line.append(morphoChallengeAnalysis.getFeatureString());
				separator = ", ";
			}
			
			return line.toString();
		}

		/**
		 * Places all the MorphoChallengeAnalysis instances belonging to
		 * answerKeyEntry1 or to answerKeyEntry2 into a single new
		 * MorphoChallengeAnswerKeyEntry for their common wordform.
		 *
		 * @return the combined entry, or null (after reporting the problem on
		 *         System.err) when the two entries have different wordforms
		 */
		public static 
		MorphoChallengeAnswerKeyEntry combineTwoAnswerKeyEntries(
				MorphoChallengeAnswerKeyEntry answerKeyEntry1, 
				MorphoChallengeAnswerKeyEntry answerKeyEntry2) {
			
			if ( ! answerKeyEntry1.wordform.equals(answerKeyEntry2.wordform)) {
				System.err.println();
				System.err.println("ERROR: Can't combine MorphoChallengeAnswerKeyEntry's");
				System.err.println("  with different 'wordforms': ");
				System.err.println("    " + answerKeyEntry1.wordform);
				System.err.println("    " + answerKeyEntry2.wordform);
				System.err.println();
				
				return null;
			}
			
			MorphoChallengeAnswerKeyEntry combinedAnswerKeyEntry =
				new MorphoChallengeAnswerKeyEntry(answerKeyEntry1.wordform);
			
			combinedAnswerKeyEntry.morphoChallengeAnalyses.addAll(
					answerKeyEntry1.morphoChallengeAnalyses);
			combinedAnswerKeyEntry.morphoChallengeAnalyses.addAll(
					answerKeyEntry2.morphoChallengeAnalyses);
			
			return combinedAnswerKeyEntry;
		}

	}
	
	/**
	 * A single Morphology Challenge analysis of a wordform: the citation form
	 * followed by zero or more feature strings derived from one Celex feature
	 * string.
	 */
	private static class MorphoChallengeAnalysis {
		
		// A central repository for the official feature names that are output to
		// the morpho challenge style answer key.
		private static final String SINGULAR      = "+sg";
		private static final String PLURAL        = "+pl";
		
		private static final String NOMINATIVE    = "+nom";
		private static final String ACCUSATIVE    = "+acc";
		private static final String DATIVE        = "+dat";
		private static final String GENITIVE      = "+gen";
		
		private static final String FIRST_PERSON  = "+1st";
		private static final String SECOND_PERSON = "+2nd";
		private static final String THIRD_PERSON  = "+3rd";
		
		private static final String INDICATIVE    = "+indic";
		private static final String SUBJUNCTIVE   = "+subjunc";
		private static final String IMPERATIVE    = "+imperative";
		
		private static final String INFINITIVE    = "+inf";
		private static final String PARTICIPLE    = "+part";
		
		private static final String PRESENT       = "+pres";
		private static final String PAST          = "+past";
		
		private static final String COMPARATIVE   = "+comparative";
		private static final String SUPERLATIVE   = "+superlative";
		
		// These are endings that go on adjectives. I think these are case
		// and number endings for when adjectives behave as nouns.  But
		// CELEX unhelpfully just lists them as: 'Suff_e', 'Suff_en', etc.
		private static final String ADJ_SUFF_E    = "+ADJ_e";
		private static final String ADJ_SUFF_EN   = "+ADJ_en";
		private static final String ADJ_SUFF_ER   = "+ADJ_er";
		private static final String ADJ_SUFF_EM   = "+ADJ_em";
		private static final String ADJ_SUFF_ES   = "+ADJ_es";
		private static final String ADJ_SUFF_S    = "+ADJ_s";
		
		// Each table row pairs a Celex code character with the feature it
		// stands for. Rows are listed in the order in which the features are
		// written out.
		private static final String[][] NOUN_CASES = {
			{"n", NOMINATIVE},
			{"a", ACCUSATIVE},
			{"d", DATIVE},
			{"g", GENITIVE},
		};
		
		private static final String[][] VERB_FEATURES = {
			// person
			{"1", FIRST_PERSON},
			{"2", SECOND_PERSON},
			{"3", THIRD_PERSON},
			// number
			{"S", SINGULAR},
			{"P", PLURAL},
			// mood
			{"I", INDICATIVE},
			{"K", SUBJUNCTIVE},
			{"r", IMPERATIVE},
			// tense
			{"E", PRESENT},
			{"A", PAST},
			// non-finite features ('z' is a 'zu' infinitive like 'abzublasen')
			{"i", INFINITIVE},
			{"z", INFINITIVE},
			{"p", PARTICIPLE},
		};
		
		// POSITIVE degree ('o') is unmarked and so has no row here.
		private static final String[][] MARKED_ADJECTIVE_DEGREES = {
			{"c", COMPARATIVE},
			{"u", SUPERLATIVE},
		};
		
		private static final String[][] ADJECTIVE_SUFFIXES = {
			{"4", ADJ_SUFF_E},
			{"5", ADJ_SUFF_EN},
			{"6", ADJ_SUFF_ER},
			{"7", ADJ_SUFF_EM},
			{"8", ADJ_SUFF_ES},
			{"9", ADJ_SUFF_S},
		};
		
		String wordForm;
		
		// a morpho challenge 'feature' is any string, that string
		// could be a stem, it could be a literal morpheme, it could
		// be an abstract string representation of one or more
		// morphosyntactic features.
		//
		// Although the citation form is just another 'feature' as far as
		// Morpho challenge is concerned, it sometimes needs to be treated as a
		// special case (like when lower casing a MorphoChallengeAnalysis),
		// and so it is stored separately.
		String citationForm;
		List<String> features = new ArrayList<String>();
		
		/**
		 * Translates one Celex analysis. The Celex feature string is classified
		 * as a noun, verb, or adjective code (checked in that order) by the
		 * characters it contains; any other feature string contributes no
		 * features beyond the citation form.
		 */
		public MorphoChallengeAnalysis(CelexAnalysis celexAnalysis) { 
			
			this.wordForm = celexAnalysis.wordForm;
			this.citationForm = celexAnalysis.lemmaData.citationForm;
			
			String celexFeatureString = celexAnalysis.celexFeatureString;
			
			if (containsAnyOf(celexFeatureString, "ngda")) {
				
				// the citation form of nouns carry no inflectional morphemes
				if ( ! wordForm.equals(citationForm)) {
					incorporateMorphoSyntacticFeaturesMarkedOnNoun(celexFeatureString);
				}
				
			} else if (containsAnyOf(celexFeatureString, "izpEA123IKr")) {
				
				incorporateMorphoSyntacticFeaturesMarkedOnVerb(celexFeatureString);
				
			} else if (containsAnyOf(celexFeatureString, "ocu")) {
				
				// the citation form of German adjectives carry no inflectional morphemes
				if ( ! wordForm.equals(citationForm)) {
					incorporateMorphoSyntacticFeaturesMarkedOnAdjective(celexFeatureString);
				}
			}
			// Feature strings containing 'X', and anything else that is not a
			// noun, verb, or adjective code, add no features.
		}

		/**
		 * Builds an analysis directly from its parts. The given feature list is
		 * stored as is (not copied).
		 */
		public MorphoChallengeAnalysis(
				String wordForm, 
				String citationForm, 
				List<String> features) {
			
			this.wordForm = wordForm;
			this.citationForm = citationForm;
			this.features = features;
		}

		// true if text contains at least one of the characters in codeCharacters
		private static boolean containsAnyOf(String text, String codeCharacters) {
			for (int i = 0; i < codeCharacters.length(); i++) {
				if (text.indexOf(codeCharacters.charAt(i)) >= 0) {
					return true;
				}
			}
			return false;
		}
		
		// Reports on System.err that a second value for 'what' (Case or Degree)
		// was found in a single Celex feature string.
		private static void reportAlreadySpecified(String what, String celexFeatureString) {
			System.err.println();
			System.err.println(
					"ERROR: " + what + " has already been specified! " + celexFeatureString);
			System.err.println();
		}

		// Adds a single feature such as '+dat' or '+nom_pl'. The plural marker
		// is always joined on with '_'.
		private void 
		incorporateMorphoSyntacticFeaturesMarkedOnNoun(String celexFeatureString) {
			
			StringBuilder featureString = new StringBuilder();
			boolean caseSpecified = false;
			
			for (String[] codeAndCase : NOUN_CASES) {
				if ( ! celexFeatureString.contains(codeAndCase[0])) {
					continue;
				}
				if (caseSpecified) {
					reportAlreadySpecified("Case", celexFeatureString);
				}
				featureString.append(codeAndCase[1]);
				caseSpecified = true;
			}
			
			if ( ! caseSpecified) {
				System.err.println();
				System.err.println("ERROR: No case specified! " + celexFeatureString);
			}

			if (celexFeatureString.contains("P")) {
				featureString.append(PLURAL.replace('+', '_'));
			}
			
			features.add(featureString.toString());
		}

		// A reasonable statement for German is that only a single verbal
		// suffix can occur on any one verb. And that that suffix marks
		// multiple features, and possibly more than one set of features
		// (i.e. syncretism). So all the verbal features are fused into ONE
		// feature string such as '+3rd_sg_indic_pres': the first feature keeps
		// its leading '+', every later one has the '+' turned into '_'.
		private void 
		incorporateMorphoSyntacticFeaturesMarkedOnVerb(String celexFeatureString) {
			
			StringBuilder featureString = new StringBuilder();
			
			for (String[] codeAndFeature : VERB_FEATURES) {
				if ( ! celexFeatureString.contains(codeAndFeature[0])) {
					continue;
				}
				String feature = codeAndFeature[1];
				if (featureString.length() > 0) {
					feature = feature.replace('+', '_');
				}
				featureString.append(feature);
			}
			
			features.add(featureString.toString());
		}

		// DEGREE is marked separately from the quasi-nominal endings and
		// so each counts as a separate Morphology Challenge feature.
		private void 
		incorporateMorphoSyntacticFeaturesMarkedOnAdjective(String celexFeatureString) {
			
			// DEGREE: POSITIVE, COMPARATIVE, SUPERLATIVE
			//
			// POSITIVE is an unmarked feature, so no Morphology Challenge
			// feature is written out for it.
			boolean degreeMarked = celexFeatureString.contains("o");
			
			for (String[] codeAndDegree : MARKED_ADJECTIVE_DEGREES) {
				if ( ! celexFeatureString.contains(codeAndDegree[0])) {
					continue;
				}
				if (degreeMarked) {
					reportAlreadySpecified("Degree", celexFeatureString);
				}
				features.add(codeAndDegree[1]);
				degreeMarked = true;
			}

			// The special (unhelpful) adjective (nominalized) endings
			// I don't know for certain that these are mutually exclusive.
			for (String[] codeAndSuffix : ADJECTIVE_SUFFIXES) {
				if (celexFeatureString.contains(codeAndSuffix[0])) {
					features.add(codeAndSuffix[1]);
				}
			}
		}

		/** @return the citation form followed by every feature, space separated */
		public String getFeatureString() {
			StringBuilder morphoChallengeAnalysisString = new StringBuilder();
			morphoChallengeAnalysisString.append(citationForm);
			
			for (String feature : features) {
				morphoChallengeAnalysisString.append(' ').append(feature);
			}
			
			return morphoChallengeAnalysisString.toString();
		}

		/**
		 * @return a copy of this analysis whose wordform and citation form are
		 *         lower cased; the features are copied unchanged
		 */
		public MorphoChallengeAnalysis toLowerCase() {
			// Locale.ROOT keeps the result independent of the JVM's default
			// locale. The feature list is copied so that the two analyses do
			// not share one mutable list.
			return new MorphoChallengeAnalysis(
					wordForm.toLowerCase(Locale.ROOT),
					citationForm.toLowerCase(Locale.ROOT),
					new ArrayList<String>(features));
		}
		
		/** @return the wordform immediately followed by the feature string */
		@Override
		public String toString() {
			return wordForm + getFeatureString();
		}
		
		@Override
		public int hashCode() {
			return citationForm.hashCode() * features.hashCode();
		}
		
		/**
		 * Two analyses are equal when they have the same wordform, the same
		 * citation form, and the same features in the same order.
		 *
		 * The feature comparison is order sensitive so that it agrees with
		 * hashCode(), which hashes the feature List: equal analyses must have
		 * equal hash codes. The methods of this class always add features in
		 * one fixed order.
		 */
		@Override
		public boolean equals(Object o) {
			if (this == o) {
				return true;
			}
			if ( ! (o instanceof MorphoChallengeAnalysis)) {
				return false;
			}
			MorphoChallengeAnalysis that = (MorphoChallengeAnalysis)o;
			
			return this.wordForm.equals(that.wordForm)
				&& this.citationForm.equals(that.citationForm)
				&& this.features.equals(that.features);
		}
	}
	
	// The two input lexicons and the output answer key. All three are opened
	// by the constructor.
	private BufferedReader celexLemmaLexicon_BufferedReader;
	private BufferedReader celexWordformsLexicon_BufferedReader;
	private PrintWriter morphoChallengeAnswerKey_PrintWriter;
	
	// The Celex Lemma lexicon is initially read into this data structure.
	// Then while reading the Celex Wordform lexicon, the lemma info is
	// added into the info for each wordform
	private Map<Integer, CelexLemmaData> celexLemmaLexiconByLemmaID = 
		new TreeMap<Integer, CelexLemmaData>();
	
	// Every Celex analysis read from the wordform lexicon, grouped by the
	// (case sensitive) wordform string.
	private Map<String, AllCelexInfoForAWordform> celexLexiconByWordForm = 
		new TreeMap<String, AllCelexInfoForAWordform>();
	
	// The answer key entries, keyed by wordform. Filled in by
	// convertFromCelexToMorphoChallenge() and then replaced wholesale by
	// lowercaseMorphoChallengeAnswerKeyEntries().
	private Map<String, MorphoChallengeAnswerKeyEntry> 
		morphoChallengeAnswerKeyEntriesByWordform =
			new TreeMap<String, MorphoChallengeAnswerKeyEntry>();
	
	
	/**
	 * Opens the three files and reads both Celex lexicons into memory.
	 * The input readers are closed again before the constructor returns;
	 * the output file stays open for the answer key to be written later.
	 *
	 * @param celexLemmaLexiconFile the Celex lemma lexicon (read as ISO-8859-1)
	 * @param celexWordFormsLexiconFile the Celex wordform lexicon (read as ISO-8859-1)
	 * @param morphoChallengeAnswerKeyFile where the answer key will be written
	 *        (as ISO-8859-1)
	 * @throws IOException if reading or closing either lexicon fails
	 */
	public 
	AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey_versionForEMNLP2007(
			File celexLemmaLexiconFile,
			File celexWordFormsLexiconFile,
			File morphoChallengeAnswerKeyFile) throws IOException {
		
		celexLemmaLexicon_BufferedReader =
			openFileForReading(celexLemmaLexiconFile);
		celexWordformsLexicon_BufferedReader =
			openFileForReading(celexWordFormsLexiconFile);
		
		morphoChallengeAnswerKey_PrintWriter = 
			openFileForWriting(morphoChallengeAnswerKeyFile, "ISO-8859-1"); // latin-1
		
		// Both lexicons are read completely right here, so release the input
		// files as soon as reading finishes, whether or not it succeeded.
		try {
			readCelexLexicons();
		} finally {
			try {
				celexLemmaLexicon_BufferedReader.close();
			} finally {
				celexWordformsLexicon_BufferedReader.close();
			}
		}
	}

	// Reads both Celex lexicons. The lemma lexicon is read first because
	// readCelexWordFormLexicon() looks up the lemma of each wordform in
	// celexLemmaLexiconByLemmaID.
	private void readCelexLexicons() throws IOException {
		readCelexLemmaLexicon();
		readCelexWordFormLexicon();
	}
	
	/**
	 * Reads the Celex lemma lexicon into celexLemmaLexiconByLemmaID.
	 * Blank lines are skipped. Reading stops (with a message on System.err)
	 * at the first line that does not look like a lemma entry; the lemmas read
	 * up to that point are kept.
	 *
	 * @throws IOException if the lexicon file cannot be read
	 * @throws NumberFormatException if a lemma ID does not fit in an int
	 */
	private void readCelexLemmaLexicon() throws IOException {
		System.err.println();
		System.err.println("Reading the German Celex Lemma Lexicon...");
		System.err.println();
		
		// The patterns are the same for every line, so compile them just once.
		Pattern blankLinePattern = Pattern.compile("^\\s*$");
		
		// Each line is a list of '\' separated fields. 
		// I am only interested in the first and second fields:
		//
		// 1) lemmaID
		// 2) citation form
		//
		// Insanely, to match a single '\' character in a file you need 4 '\' 
		// characters in the Pattern String. Two of them disappear because
		// of the string literal, and the two remaining match a '\' within
		// the regular expression.
		//
		Pattern citationFormPattern = Pattern.compile("^(\\d+)\\\\([^\\\\]+).*$");
		
		int lineCounter = 0;
		String lineFromCelexLemmaLexicon;
		while ((lineFromCelexLemmaLexicon = celexLemmaLexicon_BufferedReader.readLine()) != null) {
			
			// skip blank lines
			if (blankLinePattern.matcher(lineFromCelexLemmaLexicon).matches()) {
				continue;
			}
			
			// progress report
			lineCounter++;
			if ((lineCounter%10000) == 0) {
				System.err.println("  " + lineCounter + " " + lineFromCelexLemmaLexicon);
				System.err.flush();
			}
			
			Matcher citationFormMatcher = 
				citationFormPattern.matcher(lineFromCelexLemmaLexicon);
		
			if ( ! citationFormMatcher.matches()) {
				System.err.println("The Celex Lexicon file is bad");
				System.err.println(" line: " + lineFromCelexLemmaLexicon);
				System.err.println("  EXITING...");
				System.err.println();
				return;
			}
			
			int lemmaID = Integer.parseInt(citationFormMatcher.group(1));
			
			// The Official Morphology Challenge German answer key is lowercased.
			// The citation form is deliberately NOT lower cased here; lower
			// casing is a separate, later step.
			String citationForm = citationFormMatcher.group(2);
			
			celexLemmaLexiconByLemmaID.put(
					lemmaID, 
					new CelexLemmaData(citationForm, lemmaID));
		}
	}
	
	/**
	 * Reads the Celex wordform lexicon into celexLexiconByWordForm, creating
	 * one CelexAnalysis per comma separated feature code on each line.
	 * Blank lines and wordforms containing whitespace are skipped. Reading
	 * stops (with a message on System.err) at the first line that does not
	 * look like a wordform entry; the wordforms read up to that point are kept.
	 *
	 * @throws IOException if the lexicon file cannot be read, or if a wordform
	 *         refers to a lemma ID that is not in celexLemmaLexiconByLemmaID
	 * @throws NumberFormatException if a lemma ID does not fit in an int
	 */
	private void readCelexWordFormLexicon() throws NumberFormatException, IOException {
		System.err.println(); 
		System.err.println("Reading the German Celex Wordform Lexicon...");
		System.err.println();
		
		// The patterns are the same for every line, so compile them just once.
		Pattern blankLinePattern = Pattern.compile("^\\s*$");
		Pattern containsWhitespacePattern = Pattern.compile("^.*\\s.*$");
		Pattern featureCodeSeparatorPattern = Pattern.compile("\\s*,\\s*");
		
		// Each line is a list of 5 '\' separated fields. 
		// I am interested in the second, fourth, and fifth fields:
		//
		// 1) wordform ID, useless
		// 2) wordform
		// 3) count of wordform in the 'Mannheim' corpus, useless
		// 4) lemmaID, to link this wordform up with its lemma/citation form
		// 5) comma separated celex feature codes
		//
		Pattern wordformPattern = 
			Pattern.compile("^(\\d+)\\\\([^\\\\]+)\\\\(\\d+)\\\\(\\d+)\\\\(.*)$");
		
		int lineCounter = 0;
		String lineFromCelexWordformLexicon;
		while ((lineFromCelexWordformLexicon = celexWordformsLexicon_BufferedReader.readLine()) != null) {
			
			// skip blank lines
			if (blankLinePattern.matcher(lineFromCelexWordformLexicon).matches()) {
				continue;
			}
			
			// progress report
			lineCounter++;
			if ((lineCounter%10000) == 0) {
				System.err.println("  " + lineCounter + " " + lineFromCelexWordformLexicon);
				System.err.flush();
			}
			
			Matcher wordformMatcher = 
				wordformPattern.matcher(lineFromCelexWordformLexicon);
		
			if ( ! wordformMatcher.matches()) {
				System.err.println("The Celex Wordform lexiconfile is bad");
				System.err.println(" line: " + lineFromCelexWordformLexicon);
				System.err.println("  EXITING...");
				System.err.println();
				return;
			}
			
			String wordform = wordformMatcher.group(2);
			// skip wordforms containing a space
			if (containsWhitespacePattern.matcher(wordform).matches()) {
				continue;
			}
			
			int lemmaID = Integer.parseInt(wordformMatcher.group(4));
			CelexLemmaData celexLemmaDataOfWordForm = celexLemmaLexiconByLemmaID.get(lemmaID);
			
			// Without its lemma a wordform cannot be converted later on (the
			// citation form would be missing), so fail right here with a
			// message that names the offending line.
			if (celexLemmaDataOfWordForm == null) {
				throw new IOException(
						"The Celex Wordform lexicon refers to lemma ID " + lemmaID +
						", which is not in the Celex Lemma lexicon. line: " +
						lineFromCelexWordformLexicon);
			}
					
			String[] celexFeatureCodes = 
				featureCodeSeparatorPattern.split(wordformMatcher.group(5));
			
			// The same wordform string can appear on more than 1 line of a Celex
			// 'wordform' lexicon, so the analyses from every such line are
			// accumulated in a single AllCelexInfoForAWordform.
			AllCelexInfoForAWordform allCelexInfoForWordform = 
				celexLexiconByWordForm.get(wordform);
			if (allCelexInfoForWordform == null) {
				allCelexInfoForWordform = new AllCelexInfoForAWordform(wordform);
				celexLexiconByWordForm.put(wordform, allCelexInfoForWordform);
			}
			
			for (String celexFeatureCode : celexFeatureCodes) {
				allCelexInfoForWordform.addCelexAnalysis(
						new CelexAnalysis(
								wordform, 
								celexLemmaDataOfWordForm,
								celexFeatureCode));
			}
		}
	}

	/**
	 * Opens a file for buffered reading, decoding it as ISO-8859-1 (latin 1).
	 * If the file cannot be opened, an explanation is printed to System.err
	 * and the JVM is terminated with a non-zero exit status.
	 *
	 * @param fileToOpen the file to read
	 * @return a reader over the file
	 */
	public BufferedReader openFileForReading(File fileToOpen) {
		
		BufferedReader bufferedReaderToReturn = null;
		
		try {
			bufferedReaderToReturn = 
				new BufferedReader(
						new InputStreamReader(
								new FileInputStream(fileToOpen),
							    "ISO-8859-1")); //latin 1
		}
		catch(FileNotFoundException e) {	
			System.err.println();
			System.err.println("  Sorry.  The file: " + fileToOpen.getAbsolutePath());
			System.err.println("    could not be read.  Here is the full Java error:");
			System.err.println();
			System.err.println(e.getMessage());
			System.err.println();
			System.err.println("  Did NOT successfully set the corpus path.");
			System.err.println();
			// A non-zero status, so that a calling script can tell that we failed.
			System.exit(1);
		}
		catch(Exception e) {
			System.err.println();
			System.err.println("  Sorry.  While opening the file: " + fileToOpen.getAbsolutePath());
			System.err.println("    an error was encountered.  Here is the full Java error:");
			System.err.println();
			System.err.println(e.getMessage());
			System.err.println();
			System.err.println("  Did NOT successfully set the corpus path.");
			System.err.println();
			// A non-zero status, so that a calling script can tell that we failed.
			System.exit(1);
		}
		
		return bufferedReaderToReturn;
	}
	
	/**
	 * Opens a file for buffered, auto-flushing text output in the given
	 * encoding. If the file cannot be opened, an explanation is printed to
	 * System.err and the JVM is terminated with a non-zero exit status.
	 *
	 * @param fileToOpen the file to write
	 * @param encoding the name of the character encoding to write with
	 * @return a writer onto the file
	 */
	private PrintWriter openFileForWriting(File fileToOpen, String encoding) {

		PrintWriter printWriterToReturn = null;
		
		try {
			printWriterToReturn = 
				new PrintWriter(
						new BufferedWriter(
								new OutputStreamWriter(
										new FileOutputStream(fileToOpen),
										encoding)),
						true); // true to autoflush
			
		} catch (FileNotFoundException e) {
			System.err.println();
			System.err.println("Cannot set the output file:");
			System.err.println("  " + fileToOpen.getAbsolutePath());
			System.err.println();
			// A non-zero status, so that a calling script can tell that we failed.
			System.exit(1);

		} catch (IOException e) {
			System.err.println("Failed to open the output file because");
			System.err.println("  of the following internal error:");
			e.printStackTrace();
			System.err.println();
			// A non-zero status, so that a calling script can tell that we failed.
			System.exit(1);
		}
		
		return printWriterToReturn;
	}

	
	/**
	 * Reads the two Celex lexicons, converts them into a lower cased
	 * Morphology Challenge style answer key, and writes that answer key out.
	 *
	 * @param args the command line arguments which must look like:
	 *
	 * &lt;path-to-CelexLemmaLexiconFile&gt; &lt;path-to-CelexWordformsLexiconFile&gt;
	 * &lt;path-to-MorphoChallengeAnswerKeyOutputFile&gt;
	 * @throws IOException if reading one of the lexicons fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 3) {
			String newline = String.format("%n");
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey " + newline +
			 "        <path-to-CelexLemmaLexiconFile> " + newline +
			 "        <path-to-CelexWordformsLexiconFile>" + newline +
			 "        <path-to-MorphoChallengeAnswerKeyOutputFile>" + newline + newline);
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			// A bad command line is an error: exit with a non-zero status so
			// that a calling script can tell that nothing was produced.
			System.exit(1);
		}
		
		AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey_versionForEMNLP2007 converter = 
			new AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey_versionForEMNLP2007(
					new File(args[0]), 
					new File(args[1]),
					new File(args[2]));
		
		converter.convertFromCelexToMorphoChallenge();
		converter.lowercaseMorphoChallengeAnswerKeyEntries();
		converter.writeMorphoChallengeAnswerKeys();
	}    
	
	/**
	 * Builds one MorphoChallengeAnswerKeyEntry for every wordform in
	 * celexLexiconByWordForm, translating each of the wordform's Celex
	 * analyses into a MorphoChallengeAnalysis. Progress is reported on
	 * System.err.
	 */
	public void convertFromCelexToMorphoChallenge() {
		System.err.println();
		System.err.println("Converting from Celex style features to Morphology Challenge Style answer key format");
		System.err.println();
		
		int numberConverted = 0;
		for (Map.Entry<String, AllCelexInfoForAWordform> wordformAndCelexInfo : 
				celexLexiconByWordForm.entrySet()) {
			
			numberConverted++;
			if ((numberConverted % 10000) == 0) {
				System.err.println("  " + numberConverted + " wordforms converted");
			}
			
			String wordform = wordformAndCelexInfo.getKey();
			
			// The (still empty) entry is registered first and filled in afterwards.
			MorphoChallengeAnswerKeyEntry answerKeyEntry =
				new MorphoChallengeAnswerKeyEntry(wordform);
			morphoChallengeAnswerKeyEntriesByWordform.put(wordform, answerKeyEntry);
			
			List<CelexAnalysis> celexAnalyses = 
				wordformAndCelexInfo.getValue().celexAnalzses;
			for (CelexAnalysis celexAnalysis : celexAnalyses) {
				answerKeyEntry.addMorphoChallengeAnalysis(
						new MorphoChallengeAnalysis(celexAnalysis));
			}
		}
	}
	
	// Sadly, the German corpus I have (from the Morphology Challenge 2007) is all
	// lower case. Hence the Morphology Challenge answer key needs to also be in
	// lower case. Morphology Challenge answer key entries may *collide* when they
	// are lower cased; entries that collide are merged into a single entry.
	//
	// When this method is done, morphoChallengeAnswerKeyEntriesByWordform holds
	// the lower cased entries in place of the case sensitive ones.
	private void lowercaseMorphoChallengeAnswerKeyEntries() {
		
		System.err.println();
		System.err.println("Lower casing the MorpholChallengeAnswerKeyEntries");
		System.err.println();
		
		Map<String, MorphoChallengeAnswerKeyEntry> lowerCasedEntries =
			new TreeMap<String, MorphoChallengeAnswerKeyEntry>();
		
		int numberLowerCased = 0;
		for (MorphoChallengeAnswerKeyEntry caseSensitiveEntry : 
				morphoChallengeAnswerKeyEntriesByWordform.values()) {
			
			numberLowerCased++;
			if ((numberLowerCased % 10000) == 0) {
				System.err.println(
						numberLowerCased + 
						" Morphology Challenge Answer Key Entries have been lower cased");
			}
			
			MorphoChallengeAnswerKeyEntry lowerCasedEntry = 
				caseSensitiveEntry.toLowerCase();
			
			// Each case sensitive entry is stored under its own wordform, so the
			// lower cased entry's wordform IS the lower cased key.
			String lowerCasedWordform = lowerCasedEntry.wordform;
			
			MorphoChallengeAnswerKeyEntry entryToStore = lowerCasedEntry;
			if (lowerCasedEntries.containsKey(lowerCasedWordform)) {
				
				// Another entry already lower cased to this same wordform:
				// store the combination of the two.
				entryToStore = 
					MorphoChallengeAnswerKeyEntry.combineTwoAnswerKeyEntries(
							lowerCasedEntries.get(lowerCasedWordform),
							lowerCasedEntry);
			}
			
			lowerCasedEntries.put(lowerCasedWordform, entryToStore);
		}
		
		morphoChallengeAnswerKeyEntriesByWordform = lowerCasedEntries;
	}

	/**
	 * Writes every answer key entry, one per line, to the answer key file and
	 * then closes that file. Progress is reported on System.err, as is any
	 * failure to write.
	 */
	private void writeMorphoChallengeAnswerKeys() {
		
		System.err.println();
		System.err.println("Writing out the Morphology Challenge answer key");
		System.err.println();
		
		int wordformCounter = 0;
		for (MorphoChallengeAnswerKeyEntry morphoChallengeAnswerKeyEntry : 
			morphoChallengeAnswerKeyEntriesByWordform.values()) {
			
			wordformCounter++;
			if ((wordformCounter % 10000) == 0) {
				System.err.println("  " + wordformCounter + " wordforms written");
			}
			
			morphoChallengeAnswerKey_PrintWriter.println(
					morphoChallengeAnswerKeyEntry.toString());
		}
		
		// Nothing more is written after this point, so release the output file.
		morphoChallengeAnswerKey_PrintWriter.close();
		
		// A PrintWriter never throws an IOException; it only records that
		// something went wrong. Ask, so that a failed write is not mistaken
		// for a complete answer key.
		if (morphoChallengeAnswerKey_PrintWriter.checkError()) {
			System.err.println();
			System.err.println("ERROR: Failed while writing the Morphology Challenge answer key.");
			System.err.println("  The output file may be incomplete.");
			System.err.println();
		}
	}	
}

