package cmonson.morphologyChallengeUtilities;

import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * The gold-standard answer key for German, even just the morphological one, has had
 * bug fixes since the deadline for EMNLP-07 (March 2007). So the results I report in
 * the final version of the paper may end up slightly different from the results
 * reported in the originally submitted version.
 * 
 * @author cmonson
 *
 */
public class AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey {
	
	/*
	 * A surface Word form may occur on/in more than one line of a Celex Lexicon file.
	 * This happens when a single word form can be analyzed both as a verb and
	 * as a noun, for example.
	 */
	/* I am going to push my internal Java representation of the Celex database
	 * closer to the representation Celex itself uses, and so I am going to index by
	 * Celex wordformID instead of by the wordform itself. Each wordform entry in 
	 * Celex has a unique wordformID, but wordform strings themselves may collide.
	 * Separate POS entries with the same surface form have separate Celex entries.
	 * By keeping things closer to Celex, I'll be able to more closely mimic the
	 * Morpho Challenge 2007 gold-standard analysis file.

	private static class AllCelexInfoForAWordform {
		String wordform;
		List<CelexAnalysis> celexAnalzses = new ArrayList<CelexAnalysis>();
		
		public AllCelexInfoForAWordform(String wordform) {
			this.wordform = wordform;
		}
		
		public void addCelexAnalysis(CelexAnalysis celexAnalysis) {
			celexAnalzses.add(celexAnalysis);
		}
		
		@Override
		public String toString() {
			String toReturn = "";
			toReturn += wordform;
			for (CelexAnalysis celexAnalysis : celexAnalzses) {
				toReturn += " " + celexAnalysis.toString();
			}
			return toReturn;
		}
	}
	*/
	
	/**
	 * One entry of the Celex wordform lexicon together with the data of the
	 * lemma it is attached to.
	 * <p>
	 * A Celex feature string is something like 'dS', or 'aP', or 'dS,aP'.
	 */
	private static class CelexWordformData {
		/** Celex wordformID of this entry. */
		int wordformID;
		/** Surface form of the word. */
		String wordform;
		/** Lemma data attached to this wordform entry. */
		CelexLemmaData lemmaData;
		/** Celex feature string of this entry, e.g. 'dS,aP'. */
		String celexFeatureString;

		/**
		 * Stores the four values as given; nothing is validated or copied.
		 */
		private CelexWordformData(
				int wordformID,
				String wordForm,
				CelexLemmaData lemmaData,
				String celexFeatureString) {

			this.celexFeatureString = celexFeatureString;
			this.lemmaData = lemmaData;
			this.wordform = wordForm;
			this.wordformID = wordformID;
		}

		/**
		 * @return the wordform, the lemma data and the feature string joined
		 *         by '#' (the wordformID is not part of the text)
		 */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();
			text.append(wordform).append('#');
			text.append(lemmaData).append('#');
			text.append(celexFeatureString);
			return text.toString();
		}
	}
	
	/**
	 * The pieces of a Celex lemma entry that this program keeps: the citation
	 * form, the lemma ID and the derivational structure string.
	 */
	private static class CelexLemmaData {
		/** Citation form of the lemma. */
		String citationForm;
		/** Celex ID of the lemma. */
		int lemmaID;
		/** Derivational structure string as stored for the lemma. */
		private String derivationalStructure;

		/**
		 * Stores the three values as given; nothing is validated or copied.
		 */
		private CelexLemmaData(
				String citationForm,
				int lemmaID,
				String derivationalStructure) {

			this.derivationalStructure = derivationalStructure;
			this.lemmaID = lemmaID;
			this.citationForm = citationForm;
		}

		/**
		 * @return citation form, lemma ID and derivational structure joined by '#'
		 */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();
			text.append(citationForm).append('#');
			text.append(lemmaID).append('#');
			text.append(derivationalStructure);
			return text.toString();
		}
	}
	
	/**
	 * One entry of the answer key: a wordform plus the set of distinct
	 * Morpho-Challenge-style analyses collected for it.
	 */
	private static class MorphoChallengeAnswerKeyEntry {

		String wordform;

		// A HashSet: analyses that are equal() to one another are kept only
		// once, and iteration order is whatever the hash set yields.
		Set<MorphoChallengeAnalysis> morphoChallengeAnalyses =
			new HashSet<MorphoChallengeAnalysis>();

		/** Creates an entry for 'wordForm' that has no analyses yet. */
		public MorphoChallengeAnswerKeyEntry(String wordForm) {
			wordform = wordForm;
		}

		/** Adds one analysis; an analysis equal to a stored one is ignored by the set. */
		public void 
		addMorphoChallengeAnalysis(MorphoChallengeAnalysis morphoChallengeAnalysis) {
			this.morphoChallengeAnalyses.add(morphoChallengeAnalysis);
		}

		/**
		 * @return a new entry whose wordform is lower-cased and which holds the
		 *         lower-cased version of every analysis of this entry; this
		 *         entry itself is left untouched
		 */
		public MorphoChallengeAnswerKeyEntry toLowerCase() {

			MorphoChallengeAnswerKeyEntry lowered =
				new MorphoChallengeAnswerKeyEntry(wordform.toLowerCase());

			for (MorphoChallengeAnalysis analysis : morphoChallengeAnalyses) {
				lowered.addMorphoChallengeAnalysis(analysis.toLowerCase());
			}

			return lowered;
		}

		/**
		 * @return the wordform, a tab, and then the feature string of every
		 *         analysis, separated by ", "
		 */
		@Override
		public String toString() {
			StringBuilder line = new StringBuilder();
			line.append(wordform).append("\t");

			// empty before the first analysis, ", " before every later one
			String separator = "";
			for (MorphoChallengeAnalysis analysis : morphoChallengeAnalyses) {
				line.append(separator).append(analysis.getFeatureString());
				separator = ", ";
			}

			return line.toString();
		}

		/**
		 * Merges two entries for the same wordform into a new entry holding
		 * every analysis that belongs to either of them.
		 *
		 * @return the merged entry, or null (after printing an error to
		 *         System.err) when the two wordforms differ
		 */
		public static 
		MorphoChallengeAnswerKeyEntry combineTwoAnswerKeyEntries(
				MorphoChallengeAnswerKeyEntry answerKeyEntry1, 
				MorphoChallengeAnswerKeyEntry answerKeyEntry2) {

			boolean sameWordform =
				answerKeyEntry1.wordform.equals(answerKeyEntry2.wordform);

			if (sameWordform) {
				MorphoChallengeAnswerKeyEntry merged =
					new MorphoChallengeAnswerKeyEntry(answerKeyEntry1.wordform);

				MorphoChallengeAnswerKeyEntry[] sources = 
					new MorphoChallengeAnswerKeyEntry[] {answerKeyEntry1, answerKeyEntry2};

				for (MorphoChallengeAnswerKeyEntry source : sources) {
					for (MorphoChallengeAnalysis analysis : source.morphoChallengeAnalyses) {
						merged.addMorphoChallengeAnalysis(analysis);
					}
				}

				return merged;
			}

			// The wordforms differ: report both of them and give up.
			System.err.println();
			System.err.println("ERROR: Can't combine MorphoChallengeAnswerKeyEntry's");
			System.err.println("  with different 'wordforms': ");
			System.err.println("    " + answerKeyEntry1.wordform);
			System.err.println("    " + answerKeyEntry2.wordform);
			System.err.println();

			return null;
		}

	}
	
	private static class MorphoChallengeAnalysis {
		
		// A central repository for the official feature names that are output to
		// the morpho challenge style answer key. These constants are particularly
		// important when more than one MACO feature string encodes the same
		// morphosyntactic feature--especially when that morphosyntactic feature is
		// always encoded by the same morpheme, such as 's', 'a', and 'o' which 
		// consistently mark plural, feminine, and masculine respectively across
		// various MACO parts of speech (adjective, determiner, etc.)
		// NOTE(review): 'MACO' and the 's'/'a'/'o' examples do not obviously apply
		// to the Celex German data handled here; this paragraph may have been
		// carried over from a sibling tool -- confirm.
		//
		/*
		private static final String SINGULAR      = "+sg";
		private static final String PLURAL        = "+pl";
		
		private static final String NOMINATIVE    = "+nom";
		private static final String ACCUSATIVE    = "+acc";
		private static final String DATIVE        = "+dat";
		private static final String GENATIVE      = "+gen";
		
		private static final String FIRST_PERSON  = "+1st";
		private static final String SECOND_PERSON = "+2nd";
		private static final String THIRD_PERSON  = "+3rd";
		
		private static final String INDICATIVE    = "+indic";
		private static final String SUBJUNCTIVE   = "+subjunc";
		private static final String IMPERATIVE    = "+imperative";
		
		private static final String INFINITIVE    = "+inf";
		private static final String PARTICIPLE    = "+part";
		
		private static final String PRESENT       = "+pres";
		private static final String PAST          = "+past";
		*/
		
		//private static final String POSITIVE      = "+positive";
		private static final String COMPARATIVE   = "+CMP";
		private static final String SUPERLATIVE   = "+SUP";
		
		// These are endings that go on adjectives. I think these are case
		// and number endings for when adjectives behave as nouns.  But
		// CELEX unhelpfully just lists them as: 'Suff_e', 'Suff_en', etc.
		private static final String ADJ_SUFF_E    = "+ADJ_e";
		private static final String ADJ_SUFF_EN   = "+ADJ_en";
		private static final String ADJ_SUFF_ER   = "+ADJ_er";
		private static final String ADJ_SUFF_EM   = "+ADJ_em";
		private static final String ADJ_SUFF_ES   = "+ADJ_es";
		private static final String ADJ_SUFF_S    = "+ADJ_s";
		
		String wordform;
		
		// a morpho challenge 'feature' is any string, that string
		// could be a stem, it could be a literal morpheme, it could
		// be an abstract string representation of one or more
		// morphosyntactic features.
		//
		// Although the citation form is just another 'feature' as far as
		// Morpho challenge is concerned. I sometimes need to treat the
		// citation form as a special case: like when lower casing a
		// MorphoChallengeAnalysis, and so it is stored separately.
		String citationForm;
		List<String> inflectionalFeatures = new ArrayList<String>();
		List<String> derivationalFeatures = new ArrayList<String>();
		
		
		/**
		 * Creates an analysis of 'wordForm'.
		 * 
		 * The two lists are stored by reference (no copy is made), so later
		 * changes made to them by the caller are visible through this object.
		 *
		 * @param wordForm the analyzed surface form
		 * @param citationForm the citation form, kept separately from the features
		 * @param inflectionalFeatures the inflectional features, in output order
		 * @param derivationalFeatures the derivational features, in output order
		 */
		public MorphoChallengeAnalysis(
				String wordForm, 
				String citationForm, 
				List<String> inflectionalFeatures, 
				List<String> derivationalFeatures) {

			this.derivationalFeatures = derivationalFeatures;
			this.inflectionalFeatures = inflectionalFeatures;
			this.citationForm = citationForm;
			this.wordform = wordForm;
		}
		
		/**
		 * Builds the Morpho-Challenge-style analyses of one Celex wordform entry.
		 * 
		 * The Celex feature string of the entry is split on commas into feature
		 * groups, and every group yields at most one analysis. A group yields
		 * nothing when its part-of-speech specific converter returns null.
		 * Entries that are neither verbal, nominal nor adjectival (including
		 * the ones marked 'X') yield an analysis without inflectional features.
		 *
		 * @param celexWordformData the Celex wordform entry to convert
		 * @return the analyses, in the order of the feature groups; may be empty
		 */
		public static List<MorphoChallengeAnalysis> 
		convertToListOfMorphoChallengeAnalyses(CelexWordformData celexWordformData) {

			List<MorphoChallengeAnalysis> analyses =
				new ArrayList<MorphoChallengeAnalysis>();

			final String surfaceForm = celexWordformData.wordform;
			final String lemma = celexWordformData.lemmaData.citationForm;

			// Each CelexWordformData may have a list of comma separated feature
			// groups. Keep them both in their original order and as a set, the
			// latter for the "does this wordform also mark ...?" look-ups.
			String[] featureGroups = 
				celexWordformData.celexFeatureString.split("\\s*,\\s*");
			Set<String> featureGroupSet = new HashSet<String>();
			for (int i = 0; i < featureGroups.length; i++) {
				featureGroupSet.add(featureGroups[i]);
			}

			for (String featureGroup : featureGroups) {

				ArrayList<String> inflectional =
					inflectionalFeaturesForGroup(
						featureGroup, featureGroupSet, celexWordformData);

				// null means: do NOT generate ANY interpretation of this
				// wordform from the current feature group
				if (inflectional == null) {
					continue;
				}

				// For an answer key that closely mimics the Morpho Challenge 2007
				// Goldstandard, INCLUDING an analysis of the derivational
				// morphology of each word, build the derivational features with
				//     getDerivationalFeatures(celexWordformData.lemmaData)
				// instead of the two statements below.
				//
				// As it stands, the answer key mimics only the inflectional
				// morphology portion of the Goldstandard and uses the bare
				// citation form as the sole 'derivational feature'.
				ArrayList<String> derivational = new ArrayList<String>();
				derivational.add(lemma);

				analyses.add(
					new MorphoChallengeAnalysis(
							surfaceForm, lemma, inflectional, derivational));
			}

			return analyses;
		}

		// Picks the verbal, nominal or adjectival converter by looking at the
		// COMPLETE feature string of the entry (not just at 'featureGroup') and
		// applies it to 'featureGroup'. Entries of any other kind (e.g. marked
		// 'X') get an empty feature list. The result is null when the chosen
		// converter asks for no analysis at all.
		private static ArrayList<String> inflectionalFeaturesForGroup(
				String featureGroup,
				Set<String> featureGroupSet,
				CelexWordformData celexWordformData) {

			final String completeFeatureString = celexWordformData.celexFeatureString;

			if (completeFeatureString.matches("^.*(i|z|p|E|A|1|2|3|I|K|r).*$")) {
				return celexFeatureCombinationToMorphoChallengeFeatures_verbal(
						featureGroup, featureGroupSet, celexWordformData);
			}
			if (completeFeatureString.matches("^.*(n|g|d|a).*$")) {
				return celexFeatureCombinationToMorphoChallengeFeatures_nominal(
						featureGroup, featureGroupSet, celexWordformData);
			}
			if (completeFeatureString.matches("^.*(o|c|u).*$")) {
				return celexFeatureCombinationToMorphoChallengeFeatures_adjectival(
						featureGroup, featureGroupSet, celexWordformData);
			}

			// Not a verb, noun, or adjective: no inflectional features.
			return new ArrayList<String>();
		}
		

		/**
		 * Converts one coherent Celex verbal feature group, such as '13SIE', into
		 * features that closely mimic the official Morpho Challenge 2007 gold
		 * standard.
		 * 
		 * Sometimes the output depends on which other feature groups the same
		 * Celex wordform can mark. The plural present subjunctives and all the
		 * past subjunctives are dropped when the wordform also has the matching
		 * indicative reading, and the past singular 1st/3rd indicative only
		 * gets +13SG when a subjunctive reading exists as well.
		 *
		 * @param celexFeatureString the one feature group to convert
		 * @param celexFeatureStringsAsSet all feature groups this wordform can mark
		 * @param celexWordformData only used to print helpful debugging output
		 * @return the features of this reading (empty for an unrecognized
		 *         group, which is also reported on System.err), or null when no
		 *         Morpho Challenge answer should be output for this group
		 */
		private static ArrayList<String> 
		celexFeatureCombinationToMorphoChallengeFeatures_verbal(
				String celexFeatureString, 
				Set<String> celexFeatureStringsAsSet, 
				CelexWordformData celexWordformData) {

			// ---------------- Non-finite forms ----------------

			// Present participle
			if (celexFeatureString.equals("pE")) {
				return verbalFeatureList("+PCP1");
			}
			// Past participle
			if (celexFeatureString.equals("pA")) {
				return verbalFeatureList("+PCP2");
			}
			// Infinitive, and the 'zu'-infinitive e.g. 'loszulassen'
			if (celexFeatureString.equals("i") || celexFeatureString.equals("z")) {
				return verbalFeatureList("+INF");
			}

			// ---------------- Imperatives (2nd person) ----------------

			// Singular
			if (celexFeatureString.equals("rS")) {
				return verbalFeatureList("+IMPV2SG");
			}
			// Plural: the Morpho Challenge 2007 answer key produces NO output
			// for a plural imperative (rP never occurs as the only analysis of
			// any wordform in Celex), so ask for no analysis at all.
			if (celexFeatureString.equals("rP")) {
				return null;
			}

			// ---------------- Present tense, singular ----------------

			// 1st indicative. The Morpho Challenge 2007 answer key uses the
			// same feature here as for past tense 1st/3rd person, so the '13'
			// is not a typo.
			if (celexFeatureString.equals("1SIE")) {
				return verbalFeatureList("+13SG");
			}
			// 1st/3rd subjunctive. Subjunctive is the only feature here that
			// Morpho Challenge 2007 considers marked.
			if (celexFeatureString.equals("13SKE")) {
				return verbalFeatureList("+CONJ");
			}
			// 2nd indicative
			if (celexFeatureString.equals("2SIE")) {
				return verbalFeatureList("+2SG");
			}
			// 2nd subjunctive. The sample of the gold standard has no example
			// of this case; the best guess is the 2nd indicative output plus
			// +CONJ.
			if (celexFeatureString.equals("2SKE")) {
				return verbalFeatureList("+2SG", "+CONJ");
			}
			// 3rd indicative
			if (celexFeatureString.equals("3SIE")) {
				return verbalFeatureList("+3SG");
			}

			// ---------------- Present tense, plural ----------------

			// 1st/3rd indicative
			if (celexFeatureString.equals("13PIE")) {
				return verbalFeatureList("+13PL");
			}
			// 1st/3rd subjunctive
			if (celexFeatureString.equals("13PKE")) {
				return subjunctiveUnlessIndicative(
						celexFeatureStringsAsSet, "13PIE", "+13PL", "+CONJ");
			}
			// 2nd indicative
			if (celexFeatureString.equals("2PIE")) {
				return verbalFeatureList("+2PL");
			}
			// 2nd subjunctive
			if (celexFeatureString.equals("2PKE")) {
				return subjunctiveUnlessIndicative(
						celexFeatureStringsAsSet, "2PIE", "+2PL", "+CONJ");
			}

			// ---------------- Past tense, singular ----------------

			// 1st/3rd indicative. In the gold standard +13SG seems to be
			// absent for (presumably strong) stems whose 1st/3rd past
			// indicative carries no overt ending. Rather than looking up
			// strong vs. weak, +13SG is added whenever a subjunctive
			// interpretation of the same wordform is possible.
			if (celexFeatureString.equals("13SIA")) {
				ArrayList<String> past = verbalFeatureList("+PAST");
				if (celexFeatureStringsAsSet.contains("13SKA")) {
					past.add("+13SG");
				}
				return past;
			}
			// 1st/3rd subjunctive
			if (celexFeatureString.equals("13SKA")) {
				return subjunctiveUnlessIndicative(
						celexFeatureStringsAsSet, "13SIA", "+CONJ", "+PAST", "+13SG");
			}
			// 2nd indicative
			if (celexFeatureString.equals("2SIA")) {
				return verbalFeatureList("+PAST", "+2SG");
			}
			// 2nd subjunctive
			if (celexFeatureString.equals("2SKA")) {
				return subjunctiveUnlessIndicative(
						celexFeatureStringsAsSet, "2SIA", "+CONJ", "+PAST", "+2SG");
			}

			// ---------------- Past tense, plural ----------------

			// 1st/3rd indicative
			if (celexFeatureString.equals("13PIA")) {
				return verbalFeatureList("+PAST", "+13PL");
			}
			// 1st/3rd subjunctive (past plural)
			if (celexFeatureString.equals("13PKA")) {
				return subjunctiveUnlessIndicative(
						celexFeatureStringsAsSet, "13PIA", "+CONJ", "+PAST", "+13PL");
			}
			// 2nd indicative
			if (celexFeatureString.equals("2PIA")) {
				return verbalFeatureList("+PAST", "+2PL");
			}
			// 2nd subjunctive (past plural)
			if (celexFeatureString.equals("2PKA")) {
				return subjunctiveUnlessIndicative(
						celexFeatureStringsAsSet, "2PIA", "+CONJ", "+PAST", "+2PL");
			}

			// ---------------- Anything else ----------------

			System.err.println();
			System.err.println(" Strange verbal form encountered:");
			System.err.println("   the offending celex feature string is: " + 
							     celexFeatureString);
			System.err.println("   from the celexWordformData: " + celexWordformData);
			System.err.println();

			return new ArrayList<String>();
		}

		// Returns a fresh, mutable list holding 'featureNames' in the given order.
		private static ArrayList<String> verbalFeatureList(String... featureNames) {
			ArrayList<String> list = new ArrayList<String>();
			for (String featureName : featureNames) {
				list.add(featureName);
			}
			return list;
		}

		// An indicative interpretation of a form overrides an interpretation in
		// the subjunctive: returns null (no answer for this reading) when the
		// wordform also marks 'indicativeGroup', and otherwise a fresh list of
		// 'subjunctiveFeatures'.
		private static ArrayList<String> subjunctiveUnlessIndicative(
				Set<String> allFeatureGroups,
				String indicativeGroup,
				String... subjunctiveFeatures) {

			if (allFeatureGroups.contains(indicativeGroup)) {
				return null;
			}
			return verbalFeatureList(subjunctiveFeatures);
		}
		

		/**
		 * Converts one Celex nominal feature group (case letter plus 'S' or 'P',
		 * e.g. 'dS') into Morpho-Challenge-style inflectional features.
		 * 
		 * The Morpho Challenge 2007 gold standard tries not to output features
		 * for morphosyntactic properties, such as 'singular', that are not
		 * marked by some change in the surface form. German nouns are therefore
		 * handled as follows:
		 * 
		 * 1) The wordform marks nominative singular ('nS'): the 'nS' reading
		 *    gets an analysis without inflectional features, every other
		 *    reading of the wordform gets no analysis at all.
		 * 
		 * 2) Otherwise
		 *    A) Singular readings each get their case:
		 *       aS -> +ACC, dS -> +DAT, gS -> +GEN.
		 *    B) Plural readings:
		 *       i)  The wordform marks nominative plural ('nP'): the 'nP'
		 *           reading gets +PL, every other plural reading gets no
		 *           analysis at all.
		 *       ii) Otherwise each plural reading gets its case and +PL:
		 *           aP -> +ACC +PL, dP -> +DAT +PL, gP -> +GEN +PL.
		 *           (In reality, aP and gP never occur alone in Celex.)
		 *
		 * @param celexFeatureString the one feature group to convert
		 * @param celexFeatureStringsAsSet all feature groups this wordform can mark
		 * @param celexWordformData only used in the warnings printed for
		 *        unexpected feature groups
		 * @return the features of this reading (possibly empty), or null when no
		 *         analysis should be output for it
		 */
		private static ArrayList<String> 
		celexFeatureCombinationToMorphoChallengeFeatures_nominal(
				String celexFeatureString, 
				Set<String> celexFeatureStringsAsSet, 
				CelexWordformData celexWordformData) {

			ArrayList<String> features = new ArrayList<String>();

			// Case 1: a nominative singular reading exists. Only that reading
			// is output, and it carries no inflectional features.
			if (celexFeatureStringsAsSet.contains("nS")) {
				return celexFeatureString.equals("nS") ? features : null;
			}

			// Case 2A: the rest of the singulars
			if (celexFeatureString.matches("^.S$")) {

				if (celexFeatureString.equals("aS")) {
					features.add("+ACC");
				} else if (celexFeatureString.equals("dS")) {
					features.add("+DAT");
				} else if (celexFeatureString.equals("gS")) {
					features.add("+GEN");
				} else {
					System.err.println();
					System.err.println("WARNING: found a Singular noun that doesn't");
					System.err.println("  match normal cases! " + celexFeatureString);
					System.err.println("    from: " + celexWordformData);
				}

				return features;
			}

			// Everything that is not of the form '<letter>S' is treated as plural.

			// Case 2Bi: a nominative plural reading exists. Only that reading
			// is output, and it carries just +PL.
			if (celexFeatureStringsAsSet.contains("nP")) {
				if ( ! celexFeatureString.equals("nP")) {
					return null;
				}
				features.add("+PL");
				return features;
			}

			// Case 2Bii: plurals when nominative plural is absent
			String caseFeature = null;
			if (celexFeatureString.equals("aP")) {
				caseFeature = "+ACC";
			} else if (celexFeatureString.equals("dP")) {
				caseFeature = "+DAT";
			} else if (celexFeatureString.equals("gP")) {
				caseFeature = "+GEN";
			}

			if (caseFeature == null) {
				System.err.println();
				System.err.println("WARNING: found a Plural noun that doesn't");
				System.err.println("  match normal cases! " + celexFeatureString);
				System.err.println("    from: " + celexWordformData);
			} else {
				features.add(caseFeature);
				features.add("+PL");
			}

			return features;
		}

		
		/**
		 * Converts one Celex adjectival feature group into Morpho-Challenge-style
		 * inflectional features.
		 * 
		 * DEGREE is marked separately from the quasi-nominal endings, so each
		 * counts as a separate feature. The degree feature, if any, comes first
		 * in the result, followed by the ending features.
		 *
		 * @param celexFeatureString the one feature group to convert
		 * @param celexFeatureStringsAsSet not used by this method
		 * @param celexWordformData not used by this method
		 * @return the features of this reading; possibly empty, never null
		 */
		private static ArrayList<String> 
		celexFeatureCombinationToMorphoChallengeFeatures_adjectival(
				String celexFeatureString, 
				Set<String> celexFeatureStringsAsSet, 
				CelexWordformData celexWordformData) {

			ArrayList<String> features = new ArrayList<String>();

			// DEGREE: POSITIVE ('o'), COMPARATIVE ('c'), SUPERLATIVE ('u')
			//
			// POSITIVE is an unmarked feature, so no Morphology Challenge
			// feature is written out for it; it is only remembered so that a
			// second degree in the same group can be reported.
			boolean degreeSeen = celexFeatureString.matches("^.*o.*$");

			final String[][] markedDegrees = {
				{ "^.*c.*$", COMPARATIVE },
				{ "^.*u.*$", SUPERLATIVE },
			};
			for (String[] degree : markedDegrees) {
				if ( ! celexFeatureString.matches(degree[0])) {
					continue;
				}
				if (degreeSeen) {
					System.err.println();
					System.err.println(
							"ERROR: Degree has already been specified! " + celexFeatureString);
					System.err.println();
				}
				features.add(degree[1]);
				degreeSeen = true;
			}

			// The special (unhelpful) adjective (nominalized) endings, which
			// Celex encodes as the digits 4 to 9. It is not known for certain
			// that these are mutually exclusive, so every one is tested.
			final String[][] endings = {
				{ "^.*4.*$", ADJ_SUFF_E  },
				{ "^.*5.*$", ADJ_SUFF_EN },
				{ "^.*6.*$", ADJ_SUFF_ER },
				{ "^.*7.*$", ADJ_SUFF_EM },
				{ "^.*8.*$", ADJ_SUFF_ES },
				{ "^.*9.*$", ADJ_SUFF_S  },
			};
			for (String[] ending : endings) {
				if (celexFeatureString.matches(ending[0])) {
					features.add(ending[1]);
				}
			}

			return features;
		}
		

		/**
		 * Extracts the derivational features of a lemma from its Celex
		 * derivational structure string.
		 * 
		 * Every "(morpheme)[POS]" piece found in the structure gives one
		 * feature: "morpheme_POS", or just "morpheme" when the POS part contains
		 * a '|'. If the structure has no such piece at all, then for Morpho
		 * Challenge 2007 the 'derivational features' are just the citation form.
		 *
		 * @param lemmaData the lemma whose derivational structure is scanned
		 * @return the features in the order in which they occur in the structure;
		 *         never empty
		 */
		@SuppressWarnings("unused")
		private static ArrayList<String> getDerivationalFeatures(CelexLemmaData lemmaData) {

			//    \\(        match '('
			//    (\\w+)     group 1 - a stem or derivational morpheme
			//    \\)        match ')'
			//    \\[        match '['
			//    ([^\\]]+)  group 2 - a sequence of characters that are NOT ']' - the POS
			//    \\]        match ']'
			//
			Matcher morphemeMatcher =
				Pattern.compile("\\((\\w+)\\)\\[([^\\]]+)\\]")
					.matcher(lemmaData.derivationalStructure);

			ArrayList<String> collected = new ArrayList<String>();
			while (morphemeMatcher.find()) {
				String morpheme = morphemeMatcher.group(1);
				String partOfSpeech = morphemeMatcher.group(2);

				// a POS containing '|' is not appended to the morpheme
				collected.add(
					partOfSpeech.contains("|") ? morpheme : morpheme + "_" + partOfSpeech);
			}

			// nothing specified in Celex: fall back to the citation form
			if (collected.isEmpty()) {
				collected.add(lemmaData.citationForm);
			}

			return collected;
		}

		/**
		 * @return the derivational feature string, followed by a single space
		 *         and the inflectional feature string when the latter is not
		 *         empty
		 */
		public String getFeatureString() {
			String derivational = getDerivationalFeatureString();
			String inflectional = getInflectionalFeatureString();

			// no inflectional features: no trailing space either
			if (inflectional.length() == 0) {
				return derivational;
			}
			return derivational + " " + inflectional;
		}
		
		/**
		 * @return the inflectional features in list order, separated by single
		 *         spaces; the empty string when there are none
		 */
		public String getInflectionalFeatureString() {
			StringBuilder joined = new StringBuilder();

			// empty before the first feature, " " before every later one
			String separator = "";
			for (String feature : inflectionalFeatures) {
				joined.append(separator).append(feature);
				separator = " ";
			}

			return joined.toString();
		}

		/**
		 * @return the derivational features in list order, separated by single
		 *         spaces; the empty string when there are none
		 */
		public String getDerivationalFeatureString() {
			StringBuilder joined = new StringBuilder();

			// empty before the first feature, " " before every later one
			String separator = "";
			for (String feature : derivationalFeatures) {
				joined.append(separator).append(feature);
				separator = " ";
			}

			return joined.toString();
		}
		
		/**
		 * Creates a copy of this analysis whose wordform and citation form are
		 * lower-cased (using the default locale of the JVM).
		 * 
		 * The two feature lists are handed to the copy as they are: they are
		 * neither copied nor lower-cased, so the copy shares them with this
		 * object.
		 * 
		 * NOTE(review): when the derivational features hold the citation form
		 * itself (as convertToListOfMorphoChallengeAnalyses builds them), the
		 * copy's getFeatureString() still shows that citation form in its
		 * original case -- confirm that this is intended.
		 *
		 * @return the lower-cased copy; this object is not modified
		 */
		public MorphoChallengeAnalysis toLowerCase() {
			String loweredWordform = wordform.toLowerCase();
			String loweredCitationForm = citationForm.toLowerCase();

			return new MorphoChallengeAnalysis(
					loweredWordform,
					loweredCitationForm,
					inflectionalFeatures,
					derivationalFeatures);
		}
		
		/**
		 * @return the wordform, the derivational feature string and the
		 *         inflectional feature string, concatenated directly with no
		 *         separator between the three parts
		 */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();
			text.append(wordform);
			text.append(getDerivationalFeatureString());
			text.append(getInflectionalFeatureString());
			return text.toString();
		}
		
		/**
		 * Hash code that agrees with {@link #equals(Object)}.
		 * 
		 * equals() compares the inflectional features as a set: their order and
		 * any duplicates are ignored. List.hashCode() depends on element order,
		 * so hashing the list directly could give two equal analyses different
		 * hash codes, and a HashSet would then keep both of them. The features
		 * are therefore hashed as a set.
		 * 
		 * The wordform is deliberately left out, as before; equal analyses have
		 * equal wordforms, so this does not affect the contract.
		 * 
		 * Note: hash values differ from the earlier product-based formula, so
		 * the iteration order of hash sets holding analyses may differ too.
		 *
		 * @return a hash of the citation form and of the set of inflectional
		 *         features
		 */
		@Override
		public int hashCode() {
			int featureSetHash = new HashSet<String>(inflectionalFeatures).hashCode();
			return 31 * citationForm.hashCode() + featureSetHash;
		}
		
		/**
		 * Two analyses are equal when they have the same wordform, the same
		 * citation form, and inflectional feature lists that contain each
		 * other's elements (order and repetitions do not matter). The
		 * derivational features take no part in the comparison.
		 *
		 * @param o the object to compare with
		 * @return true if 'o' is a MorphoChallengeAnalysis equal to this one in
		 *         the sense described above
		 */
		@Override
		public boolean equals(Object o) {
			if (o instanceof MorphoChallengeAnalysis) {
				MorphoChallengeAnalysis other = (MorphoChallengeAnalysis) o;

				return this.wordform.equals(other.wordform)
					&& this.citationForm.equals(other.citationForm)
					&& this.inflectionalFeatures.containsAll(other.inflectionalFeatures)
					&& other.inflectionalFeatures.containsAll(this.inflectionalFeatures);
			}

			// null, or some other type
			return false;
		}


	}
	
	private BufferedReader celexLemmaLexicon_BufferedReader;
	private BufferedReader celexWordformsLexicon_BufferedReader;
	private PrintWriter morphoChallengeAnswerKey_PrintWriter;
	
	// The Celex Lemma lexicon is initially read into this data structure.
	// Then while reading the Celex Wordform lexicon, the lemma info is
	// added into the info for each wordform
	private Map<Integer, CelexLemmaData> celexLemmaLexiconByLemmaID = 
		new TreeMap<Integer, CelexLemmaData>();
	
	private Map<Integer, CelexWordformData> celexWordformsByWordformID = 
		new TreeMap<Integer, CelexWordformData>();
	
	private Map<String, MorphoChallengeAnswerKeyEntry> 
		morphoChallengeAnswerKeyEntriesByWordform =
			new TreeMap<String, MorphoChallengeAnswerKeyEntry>();
	
	
	/**
	 * Opens the two Celex lexicon files for reading and the answer key file
	 * for writing (as ISO-8859-1), then loads both lexicons into memory.
	 * <p>
	 * The two lexicon readers are only needed while loading, so they are
	 * closed before the constructor returns, whether or not loading succeeds.
	 * The answer key writer stays open for later use.
	 *
	 * @param celexLemmaLexiconFile the Celex lemma lexicon to read
	 * @param celexWordFormsLexiconFile the Celex wordform lexicon to read
	 * @param morphoChallengeAnswerKeyFile the answer key file to write
	 * @throws IOException if reading a lexicon, or closing one of the lexicon
	 *         readers, fails
	 */
	public 
	AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey(
			File celexLemmaLexiconFile,
			File celexWordFormsLexiconFile,
			File morphoChallengeAnswerKeyFile) throws IOException {
		
		celexLemmaLexicon_BufferedReader =
			openFileForReading(celexLemmaLexiconFile);
		celexWordformsLexicon_BufferedReader =
			openFileForReading(celexWordFormsLexiconFile);
		
		morphoChallengeAnswerKey_PrintWriter = 
			openFileForWriting(morphoChallengeAnswerKeyFile, "ISO-8859-1"); // latin-1
		
		try {
			readCelexLexicons();
		} finally {
			// Release the input file handles. The nested finally guarantees 
			// the second reader is closed even if closing the first one throws.
			try {
				celexLemmaLexicon_BufferedReader.close();
			} finally {
				celexWordformsLexicon_BufferedReader.close();
			}
		}
	}

	/**
	 * Loads both Celex lexicons. The lemma lexicon is read first because the
	 * wordform reader looks each wordform's lemma up by lemmaID in the data
	 * the lemma reader stored.
	 *
	 * @throws IOException if reading either lexicon fails
	 */
	private void readCelexLexicons() throws IOException {
		this.readCelexLemmaLexicon();
		this.readCelexWordformLexicon();
	}
	
	/**
	 * Reads every line remaining in celexLemmaLexicon_BufferedReader and 
	 * stores one CelexLemmaData per non-blank line in 
	 * celexLemmaLexiconByLemmaID, keyed by lemmaID. If a lemmaID occurs more
	 * than once, the later line replaces the earlier one.
	 * <p>
	 * Progress is reported on stderr every 10000 non-blank lines. A non-blank
	 * line that does not have the expected field layout is fatal: the line is
	 * reported on stderr and the JVM exits with a non-zero status.
	 *
	 * @throws IOException if reading from the lemma lexicon fails
	 */
	private void readCelexLemmaLexicon() throws IOException {
		System.err.println();
		System.err.println("Reading the German Celex Lemma Lexicon...");
		System.err.println();
		
		// Both patterns are loop invariant, so they are compiled once here
		// rather than once per line of the lexicon.
		Pattern blankLinePattern = Pattern.compile("^\\s*$");
		
		// Each line is a list of '\' separated fields. 
		// I am interested in the 1st, 2nd, and 14th fields:
		//
		// 1)  lemmaID
		// 2)  citation form
		//  ...
		// 14) derivational structure, including all the smallest stems and
		//     derivational morphemes. The derivational structure may be empty
		//  ...
		//
		// Some of the fields between 2 and 14 may be empty
		//
		// To match a single '\' character in the file, 4 '\' characters are
		// needed in the Pattern String: the string literal halves them, and
		// the remaining two match one literal '\' in the regular expression.
		//
		Pattern citationFormPattern =             
			//                lemmaID \ cit. form \ 11 unneeded fields\ deriv. struct. \... 
			Pattern.compile("^(\\d+)\\\\([^\\\\]+)\\\\([^\\\\]*\\\\){11}([^\\\\]*).*$"); 
		
		int lineCounter = 0;
		String lineFromCelexLemmaLexicon;
		while ((lineFromCelexLemmaLexicon = celexLemmaLexicon_BufferedReader.readLine()) != null) {
			
			// skip blank lines
			if (blankLinePattern.matcher(lineFromCelexLemmaLexicon).matches()) {
				continue;
			}
			
			lineCounter++;
			if ((lineCounter%10000) == 0) {
				System.err.println("  " + lineCounter + " " + lineFromCelexLemmaLexicon);
				System.err.flush();
			}
			
			Matcher citationFormMatcher = 
				citationFormPattern.matcher(lineFromCelexLemmaLexicon);
		
			if ( ! citationFormMatcher.matches()) {
				System.err.println("The Celex Lexicon file is bad");
				System.err.println(" line: " + lineFromCelexLemmaLexicon);
				System.err.println("  EXITING...");
				System.err.println();
				// Non-zero status so that calling scripts can see the failure.
				System.exit(1);
			}
			
			int lemmaID = Integer.parseInt(citationFormMatcher.group(1));
			
			String citationForm = citationFormMatcher.group(2);
			// group(3) is the last of the 11 unneeded fields
			String derivationalStructure = citationFormMatcher.group(4);
			
			// The Official Morphology Challenge German answer key is lowercased,
			// but the citation form is deliberately kept case sensitive here;
			// lower casing can be applied later to the finished answer key.
			
			CelexLemmaData celexLemmaData = 
				new CelexLemmaData(
						citationForm, 
						lemmaID,
						derivationalStructure);
			celexLemmaLexiconByLemmaID.put(lemmaID, celexLemmaData);
		}
	}
	
	/**
	 * Reads every line remaining in celexWordformsLexicon_BufferedReader and
	 * stores one CelexWordformData per accepted line in 
	 * celexWordformsByWordformID, keyed by wordformID. Each wordform is linked
	 * to the lemma data found under its lemmaID in celexLemmaLexiconByLemmaID.
	 * <p>
	 * Blank lines and wordforms containing whitespace are skipped. A repeated
	 * wordformID is reported on stderr and the later line replaces the earlier
	 * one. A non-blank line that does not have the expected 5 field layout is
	 * fatal: the line is reported on stderr and the JVM exits with a non-zero
	 * status, exactly as the lemma lexicon reader does. (Merely returning here
	 * would let a silently truncated lexicon be converted into an answer key.)
	 *
	 * @throws NumberFormatException if a numeric ID does not fit in an int
	 * @throws IOException if reading from the wordform lexicon fails
	 */
	private void readCelexWordformLexicon() throws NumberFormatException, IOException {
		System.err.println(); 
		System.err.println("Reading the German Celex Wordform Lexicon...");
		System.err.println();
		
		// All three patterns are loop invariant, so they are compiled once 
		// here rather than once per line of the lexicon.
		Pattern blankLinePattern = Pattern.compile("^\\s*$");
		Pattern containsWhitespacePattern = Pattern.compile("^.*\\s.*$");
		
		// Each line is a list of 5 '\' separated fields. 
		// I am interested in the first, second, fourth, and fifth fields:
		//
		// 1) wordform ID
		// 2) wordform
		// 3) count of wordform in the 'Mannheim' corpus, useless
		// 4) lemmaID, to link this wordform up with its lemma/citation form
		// 5) comma separated celex feature codes
		//
		Pattern wordformPattern = 
			Pattern.compile("^(\\d+)\\\\([^\\\\]+)\\\\(\\d+)\\\\(\\d+)\\\\(.*)$"); 
		
		int lineCounter = 0;
		String lineFromCelexWordformLexicon;
		while ((lineFromCelexWordformLexicon = celexWordformsLexicon_BufferedReader.readLine()) != null) {
			
			// skip blank lines
			if (blankLinePattern.matcher(lineFromCelexWordformLexicon).matches()) {
				continue;
			}
			
			lineCounter++;
			if ((lineCounter%10000) == 0) {
				System.err.println("  " + lineCounter + " " + lineFromCelexWordformLexicon);
				System.err.flush();
			}
			
			Matcher wordformMatcher = 
				wordformPattern.matcher(lineFromCelexWordformLexicon);
		
			if ( ! wordformMatcher.matches()) {
				System.err.println("The Celex Wordform lexiconfile is bad");
				System.err.println(" line: " + lineFromCelexWordformLexicon);
				System.err.println("  EXITING...");
				System.err.println();
				// Actually exit, as the message says, and with a non-zero
				// status so that calling scripts can see the failure.
				System.exit(1);
			}
			
			int wordformID = Integer.parseInt(wordformMatcher.group(1));
			
			String wordform = wordformMatcher.group(2);
			// skip wordforms containing a space
			if (containsWhitespacePattern.matcher(wordform).matches()) {
				continue;
			}
			
			int lemmaID = Integer.parseInt(wordformMatcher.group(4));
			// NOTE(review): get() yields null when the lemma lexicon has no 
			// entry for this lemmaID -- confirm CelexWordformData tolerates
			// a null lemma.
			CelexLemmaData celexLemmaDataOfWordForm = celexLemmaLexiconByLemmaID.get(lemmaID);
					
			String celexFeatureCodesAsString = wordformMatcher.group(5);
			
			// Warn about a duplicate ID, then let the new entry overwrite the
			// old one below.
			if (celexWordformsByWordformID.containsKey(wordformID)) {
				System.err.println();
				System.err.println("Two words in the Celex wordform lexicon had the same");
				System.err.println("  wordformID -- this should not happen");
				System.err.println("    line: " + lineFromCelexWordformLexicon);
				System.err.println();
			}
			
			CelexWordformData celexWordformData = 
				new CelexWordformData(
						wordformID, 
						wordform, 
						celexLemmaDataOfWordForm,
						celexFeatureCodesAsString);
			
			celexWordformsByWordformID.put(wordformID, celexWordformData);			
		}
	}

	/**
	 * Opens a file as a buffered character stream decoded as ISO-8859-1 
	 * (latin 1).
	 * <p>
	 * Any failure to open the file is treated as fatal: an explanation is
	 * printed on stderr and the JVM is terminated with a non-zero exit status,
	 * so that calling scripts can detect the failure.
	 *
	 * @param fileToOpen the file to read
	 * @return a reader positioned at the start of the file
	 */
	public BufferedReader openFileForReading(File fileToOpen) {
		
		BufferedReader bufferedReaderToReturn = null;
		
		try {
			bufferedReaderToReturn = 
				new BufferedReader(
						new InputStreamReader(
								new FileInputStream(fileToOpen),
							    "ISO-8859-1")); //latin 1
		}
		catch(FileNotFoundException e) {	
			System.err.println();
			System.err.println("  Sorry.  The file: " + fileToOpen.getAbsolutePath());
			System.err.println("    could not be read.  Here is the full Java error:");
			System.err.println();
			System.err.println(e.getMessage());
			System.err.println();
			System.err.println("  Did NOT successfully set the corpus path.");
			System.err.println();
			System.exit(1);
		}
		catch(Exception e) {
			// Anything else that goes wrong while opening, e.g. an 
			// unsupported encoding name.
			System.err.println();
			System.err.println("  Sorry.  While opening the file: " + fileToOpen.getAbsolutePath());
			System.err.println("    an error was encountered.  Here is the full Java error:");
			System.err.println();
			System.err.println(e.getMessage());
			System.err.println();
			System.err.println("  Did NOT successfully set the corpus path.");
			System.err.println();
			System.exit(1);
		}
		
		return bufferedReaderToReturn;
	}
	
	/**
	 * Opens a file as an auto-flushing, buffered PrintWriter that encodes its
	 * output with the given character encoding.
	 * <p>
	 * Any failure to open the file is treated as fatal: an explanation is
	 * printed on stderr and the JVM is terminated with a non-zero exit status,
	 * so that calling scripts can detect the failure.
	 *
	 * @param fileToOpen the file to write
	 * @param encoding the name of the character encoding to write with
	 * @return a writer for the file
	 */
	private PrintWriter openFileForWriting(File fileToOpen, String encoding) {

		PrintWriter printWriterToReturn = null;
		
		try {
			printWriterToReturn = 
				new PrintWriter(
						new BufferedWriter(
								new OutputStreamWriter(
										new FileOutputStream(fileToOpen),
										encoding)),
						true); // true to autoflush
			
		} catch (FileNotFoundException e) {
			System.err.println();
			System.err.println("Cannot set the output file:");
			System.err.println("  " + fileToOpen.getAbsolutePath());
			System.err.println();
			System.exit(1);

		} catch (IOException e) {
			// e.g. the encoding name is not supported
			System.err.println("Failed to open the output file because");
			System.err.println("  of the following internal error:");
			e.printStackTrace();
			System.err.println();
			System.exit(1);
		}
		
		return printWriterToReturn;
	}

	
	/**
	 * Command line entry point. Reads the two Celex lexicons, converts them
	 * to Morphology Challenge style analyses, lower cases the resulting answer
	 * key entries, and writes the answer key file.
	 *
	 * @param args the command line arguments, which must be exactly:
	 *
	 * {@code <path-to-CelexLemmaLexiconFile> <path-to-CelexWordformsLexiconFile>
	 * <path-to-MorphoChallengeAnswerKeyOutputFile>}
	 *
	 * If the argument count is wrong, a usage message is printed and the JVM
	 * exits with a non-zero status.
	 * @throws IOException if reading the Celex lexicons fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length != 3) {
			System.out.println("The command line must look like:");
			System.out.println(
			 "    java AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey " + String.format("%n") +
			 "        <path-to-CelexLemmaLexiconFile> " + String.format("%n") +
			 "        <path-to-CelexWordformsLexiconFile>" + String.format("%n") +
			 "        <path-to-MorphoChallengeAnswerKeyOutputFile>" + String.format("%n%n"));
			System.out.println("    Exiting...");
			System.out.println();
			System.out.println();
			// A usage error is a failure, so report it in the exit status.
			System.exit(1);
		}
		
		AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey converter = 
			new AnalyzeCelexGermanIntoAMorphoChallengeAnswerKey(
					new File(args[0]), 
					new File(args[1]),
					new File(args[2]));
		
		converter.convertFromCelexToMorphoChallenge();
		converter.lowercaseMorphoChallengeAnswerKeyEntries();
		converter.writeMorphoChallengeAnswerKeys();
	}    
	
	/**
	 * Converts every Celex wordform entry, in ascending wordformID order, into
	 * its Morphology Challenge analyses, and files each analysis under the
	 * answer key entry for that analysis's wordform in
	 * morphoChallengeAnswerKeyEntriesByWordform. An answer key entry is 
	 * created the first time its wordform is seen. Progress is reported on
	 * stderr every 10000 Celex wordform entries.
	 */
	public void convertFromCelexToMorphoChallenge() {
		System.err.println();
		System.err.println("Converting from Celex style features to Morphology Challenge Style answer key format");
		System.err.println();
		
		int convertedSoFar = 0;
		for (CelexWordformData celexEntry : celexWordformsByWordformID.values()) {
			
			convertedSoFar++;
			if (convertedSoFar % 10000 == 0) {
				System.err.println("  " + convertedSoFar + " wordforms converted");
			}
			
			List<MorphoChallengeAnalysis> analysesOfCelexEntry =
				MorphoChallengeAnalysis.convertToListOfMorphoChallengeAnalyses(celexEntry);
			
			for (MorphoChallengeAnalysis analysis : analysesOfCelexEntry) {
				
				String surfaceForm = analysis.wordform;
				
				// Fetch the answer key entry for this wordform, creating and
				// registering it on first sight.
				MorphoChallengeAnswerKeyEntry answerKeyEntry =
					morphoChallengeAnswerKeyEntriesByWordform.get(surfaceForm);
				if (answerKeyEntry == null) {
					answerKeyEntry = new MorphoChallengeAnswerKeyEntry(surfaceForm);
					morphoChallengeAnswerKeyEntriesByWordform.put(surfaceForm, answerKeyEntry);
				}
				
				answerKeyEntry.addMorphoChallengeAnalysis(analysis);
			}
		}
	}
	
	// The German corpus available (from the Morphology Challenge 2007) is all
	// lower case, so the Morphology Challenge answer key must be lower case
	// as well. Distinct answer key entries may *collide* once their wordforms
	// are lower cased; colliding entries are combined into a single entry.
	//
	// Replaces morphoChallengeAnswerKeyEntriesByWordform with a new map keyed
	// by the lower cased wordforms. Progress is reported on stderr every
	// 10000 entries.
	private void lowercaseMorphoChallengeAnswerKeyEntries() {
		
		System.err.println();
		System.err.println("Lower casing the MorpholChallengeAnswerKeyEntries");
		System.err.println();
		
		Map<String, MorphoChallengeAnswerKeyEntry> mergedEntries =
			new TreeMap<String, MorphoChallengeAnswerKeyEntry>();
		
		int entriesProcessed = 0;
		for (Map.Entry<String, MorphoChallengeAnswerKeyEntry> caseSensitive : 
				morphoChallengeAnswerKeyEntriesByWordform.entrySet()) {
			
			entriesProcessed++;
			if (entriesProcessed % 10000 == 0) {
				System.err.println(
						entriesProcessed + 
						" Morphology Challenge Answer Key Entries have been lower cased");
			}
			
			String lowerCasedKey = caseSensitive.getKey().toLowerCase();
			MorphoChallengeAnswerKeyEntry lowerCasedEntry = 
				caseSensitive.getValue().toLowerCase();
			
			// By default the lower cased entry is stored as is. If another
			// entry already occupies this lower cased wordform, the two are
			// combined, with the earlier occupant passed first.
			MorphoChallengeAnswerKeyEntry entryToStore = lowerCasedEntry;
			if (mergedEntries.containsKey(lowerCasedKey)) {
				entryToStore =
					MorphoChallengeAnswerKeyEntry.combineTwoAnswerKeyEntries (
							mergedEntries.get(lowerCasedKey),
							lowerCasedEntry);
			}
			
			mergedEntries.put(lowerCasedKey, entryToStore);
		}
		
		morphoChallengeAnswerKeyEntriesByWordform = mergedEntries;
	}

	/**
	 * Writes every answer key entry, one per line and in sorted wordform
	 * order, to morphoChallengeAnswerKey_PrintWriter, then closes the writer.
	 * Progress is reported on stderr every 10000 entries.
	 * <p>
	 * Because the writer is closed here, this method must be called at most
	 * once. A PrintWriter never throws on an I/O failure, so its error flag
	 * is checked after closing; if any write failed, the problem is reported
	 * on stderr and the JVM exits with a non-zero status.
	 */
	private void writeMorphoChallengeAnswerKeys() {
		
		System.err.println();
		System.err.println("Writing out the Morphology Challenge answer key");
		System.err.println();
		
		try {
			int wordformCounter = 0;
			for (MorphoChallengeAnswerKeyEntry morphoChallengeAnswerKeyEntry : 
				morphoChallengeAnswerKeyEntriesByWordform.values()) {
				
				wordformCounter++;
				if ((wordformCounter % 10000) == 0) {
					System.err.println("  " + wordformCounter + " wordforms written");
				}
				
				morphoChallengeAnswerKey_PrintWriter.println(
						morphoChallengeAnswerKeyEntry.toString());
			}
		} finally {
			// Flushes any buffered output and releases the file handle.
			morphoChallengeAnswerKey_PrintWriter.close();
		}
		
		if (morphoChallengeAnswerKey_PrintWriter.checkError()) {
			System.err.println();
			System.err.println("An I/O error occurred while writing the Morphology Challenge answer key.");
			System.err.println("  The output file may be incomplete.");
			System.err.println();
			System.exit(1);
		}
	}	
}

