package lexicon.analyse;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;

import lexicon.dbUtils.InflectRecord;
import lexicon.dbUtils.Inflections;
import lexicon.dbUtils.PrefixRecord;
import lexicon.dbUtils.Prefixes;
import lexicon.utils.Messages;
import lexicon.utils.Translate;
import corpus.CreateCorpusXML;

/**
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GPL
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.
 * 
 * This class is the main class of the morphological analyzer. <br>
 * To run it, follow these instructions: <br>
 * Arguments to the PC program: pc inputFile outputFile <br>
 * Arguments to the UNIX program: no arguments; the input and output can be
 * redirected <br>
 * See analyzer.properties for an example. <br>
 * The input file to be analyzed must be saved as a UTF-8 file (you can use
 * Notepad to create it); it must also be a tokenized file. <br>
 * Run the program with: java -jar analyzer.jar <br>
 * <p>
 * Main Flow: <br>
 * Load the inflections. For each line from the input file: <br>
 * verify if the token is a number (create the relevant analysis) <br>
 * verify if the token is a punctuation mark (create the relevant analysis) <br>
 * get all the relevant analyses (query the inflections table and create the
 * relevant analyses) <br>
 * scan the token and verify whether relevant prefix + base combinations exist
 * <br>
 * (query the inflections table and the prefixes table) <br>
 * <p>
 * 
 * @author dalia bojan
 * @version 1.0
 */
public abstract class MorphAnalyzer implements Constants {

	// NOTE(review): not read or written anywhere in this class; presumably
	// marks the start of a new token for subclasses - confirm
	static protected boolean newTokenFlag = true;

	// parsed from the first command-line argument in handleInputParameters;
	// when false, printTimesHandling reports the load time on stderr
	static protected boolean webFlag;

	//variables for I/O handling
	// path of the inflections data file (set in handleInputParameters)
	static protected String dinflectionsFile = "";

	// path of the prefixes data file (set in handleInputParameters)
	static protected String dprefixesFile = "";

	// path of the gimatrias data file (set in handleInputParameters)
	static protected String gimatriasFile = "";

	// output writer; not assigned in this class
	static protected BufferedWriter bw = null;

	// output stream writer; not assigned in this class
	static protected OutputStreamWriter pOut = null;

	// input stream reader; not assigned in this class
	static protected InputStreamReader pIn = null;

	// buffered reader over inputFile, opened by inputFileHandling
	static protected BufferedReader bi = null;

	//private OutputStream outputStream;

	// word holds the token being analyzed; readInput sets it to
	// Translate.Heb2Eng(hebWord)
	static protected String word = "";

	//Holds the hebrew form of the relevant lexicon item
	static protected String undotted = "";

	//holds the part of speech
	static protected String pos = "";

	//holds the lexicon item id
	static protected String id = "";

	// presumably holds the transliterated form of the relevant lexicon item
	// (the previous comment duplicated the description of id) - confirm
	static protected String transliterated = "";

	//The following class variables hold lexicon item attributes
	static protected String script = "";

	static protected String gender = "";

	static protected String number = "";

	static protected String baseQuantifierType = "";

	static protected String baseNamedEntityType = "";

	static protected String baseConjunctionType = "";

	static protected String basePronounType = "";

	static protected String basePerson = "";

	static protected String PGN = "";

	static protected String binyan = "";

	static protected int binyani;

	static protected String tense = "";

	static protected String root = "";

	// NOTE(review): unlike its neighbours this field has no "" initializer,
	// so it starts out as null
	static protected String baseDefinitness;

	static protected String baseGender = "";

	static protected String baseNumber = "";

	static protected String construct = "";

	//holds the analysis to be output to output file/stdout
	static protected StringBuffer output;

	//holds the prefix record loaded from the prefixes table; read by
	// getPrefixesAttributes
	static protected PrefixRecord pr = null;

	//The following class variables hold prefix attribute values as defined in
	// the prefix table; they are refreshed from pr by getPrefixesAttributes
	static protected boolean definiteArticleTag;

	static protected boolean defArtHE;

	static protected boolean relHE;

	static protected boolean prefPartUnitTag;

	static protected boolean subConOrRelSHIN;

	static protected boolean tempSubConKAFSHIN;

	static protected boolean tempSubConMEMSHIN;

	static protected boolean tempSubConLAMEDKAFSHIN;

	static protected boolean tempSubConBETSHIN;

	static protected boolean relativizerTag;

	static protected boolean subordinatingConjunctionTag;

	static protected boolean temporalSubConjTag;

	static protected boolean conjunctionTag;

	static protected boolean prepBET;

	static protected boolean prepKAF;

	static protected boolean prepLAMED;

	static protected boolean prepMEM;

	static protected boolean adverbKAF;

	static protected boolean prefPartUnit;

	static protected boolean prepositionTag;

	// integer-coded attributes of the base; compared against the Constants
	// values (VERB, TENSE_INFINITIVE, CONSTRUCT_TRUE, ...) in validateByRules
	static protected int posi;

	static protected int constructi;

	static protected int tensei;

	static protected int suffixFunctioni;

	static protected int baseQuantifierTypei;

	// NOTE(review): this is the only attribute field that is not static
	protected int basePronounTypei;

	static protected int baseDefinitnessi;

	//holds appended output of the analysis
	static protected StringBuffer totalOutput = new StringBuffer();

	//holds the input token as read (before Translate.Heb2Eng is applied)
	static protected String hebWord = "";

	//the following holds the names of the input and output files in case of
	// running in pc
	protected static String inputFile = "";

	protected static String outputFile = "";

	//a flag to determine whether the input token has been found in the
	// inflections pattern; reset by noEntryInInflections
	static protected boolean foundWord = false;

	//a flag to determine whether the input word was analysed to be composed of
	// prefix + lemma; reset by noEntryInInflections
	static protected boolean foundBase = false;

	// A counter of the input tokens
	static protected int tokensCount = 0;

	//inflections hash table handle
	protected static Inflections inflections = null;

	//prefixes hash table handle
	protected static Prefixes prefixes = null;

	// kind of analysis found for the current token; set to one of the Constants
	// values URL, FOREIGN, LITERAL_NUMBERS, PUNCTUATION or NO_ENTRY by the
	// analyze* methods of this class
	static protected int outputPattern;

	static protected InflectRecord outputInflectionRec;

	//static protected Data data = null;

	static protected CreateCorpusXML createXML = null;

	static protected int outputType;

	// integer-coded script attribute; compared against SCRIPT_FORMAL and
	// SCRIPT_COLLOQUIAL in validateByRulesWithoutPrefixes
	static protected int scripti;

	//protected Gimatria gimatria = new Gimatria();

	/**
	 * Analyzes the current token as a whole, without splitting off a prefix.
	 * Called from readInput and apostropheInvertedCommasHandling before the
	 * prefix + base splits are tried.
	 * 
	 * @throws Exception
	 *             implementation specific
	 */
	protected abstract void analyzeBaseNoPrefix() throws Exception;

	/**
	 * Builds the output for a token recognised as a gimatria value. Called
	 * from apostropheInvertedCommasHandling when Data.getGimatrias(word)
	 * returns something other than -1.
	 * 
	 * @param value
	 *            the value returned by Data.getGimatrias for the token
	 */
	protected abstract void buildGemetriaOutput(int value);

	/**
	 * Analyzes one prefix + base split of the current token. Called by
	 * analyzeBaseAndPrefix once per candidate split position.
	 * 
	 * @param base
	 *            the part of the token after the split position
	 * @param prefix
	 *            the part of the token before the split position
	 * @throws Exception
	 *             implementation specific
	 */
	protected abstract void analyzeBase(String base, String prefix)
			throws Exception;

	/**
	 * Analyzes one prefix + base split of a token containing an apostrophe or
	 * inverted commas. Called by analyzeAcronymsBaseAndPrefix once per
	 * candidate split position; the returned value is ignored there.
	 * 
	 * @param base
	 *            the part of the token after the split position
	 * @param prefix
	 *            the part of the token before the split position
	 * @return implementation specific - NOTE(review): presumably whether an
	 *         analysis was found; confirm against the subclasses
	 */
	protected abstract boolean analyzePrefixGimatriaAndInvertedCommas(
			String base, String prefix);

	/**
	 * Handles a token that starts or ends with the literal text "prefix=".
	 * Called by analyzePrefix with the raw input token.
	 * 
	 * @param prefix
	 *            the complete input token (hebWord)
	 */
	protected abstract void handlePrefix(String prefix);

	/**
	 * Checks whether the current input token (hebWord) looks like a URL or an
	 * e-mail address.
	 * <p>
	 * A token qualifies when it contains both '@' and '.', or when it starts
	 * with "http://", "https://", "www." or "ftp://". On a match outputPattern
	 * is set to URL.
	 * 
	 * @return true if the token was classified as a URL / e-mail address
	 * @throws Exception
	 *             not thrown by this implementation; the declaration is kept
	 *             for interface compatibility
	 */
	protected boolean analyzeURL() throws Exception {
		boolean looksLikeMail = (hebWord.indexOf("@") != -1)
				&& (hebWord.indexOf(".") != -1);
		// "https://" was previously missing, so secure URLs were not recognised
		boolean hasUrlPrefix = hebWord.startsWith("http://")
				|| hebWord.startsWith("https://")
				|| hebWord.startsWith("www.")
				|| hebWord.startsWith("ftp://");
		boolean isURL = looksLikeMail || hasUrlPrefix;
		if (isURL) {
			outputPattern = URL;
		}
		return isURL;
	}

	/**
	 * Detects tokens carrying the literal marker "prefix=" at their start or
	 * end and hands them to handlePrefix.
	 * 
	 * @return true if the token carried the marker and handlePrefix was called
	 * @throws Exception
	 *             not thrown by this implementation; the declaration is kept
	 *             for interface compatibility
	 */
	protected boolean analyzePrefix() throws Exception {
		if (!hebWord.startsWith("prefix=") && !hebWord.endsWith("prefix=")) {
			return false;
		}
		handlePrefix(hebWord);
		return true;
	}

	/**
	 * Checks whether the current input token (hebWord) starts with an ASCII
	 * letter (a-z or A-Z). If so, outputPattern is set to FOREIGN.
	 * <p>
	 * An empty token is reported as not foreign instead of failing on
	 * charAt(0).
	 * 
	 * @return true if the token was classified as foreign
	 * @throws Exception
	 *             not thrown by this implementation; the declaration is kept
	 *             for interface compatibility
	 */
	protected boolean analyzeForeign() throws Exception {
		// guard: charAt(0) on an empty token would throw
		// StringIndexOutOfBoundsException
		if (hebWord.length() == 0) {
			return false;
		}

		char first = hebWord.charAt(0);
		boolean isForeign = (first >= 'a' && first <= 'z')
				|| (first >= 'A' && first <= 'Z');
		if (isForeign) {
			outputPattern = FOREIGN;
		}
		return isForeign;
	}

	/**
	 * Checks whether the current input token (hebWord) is a literal number:
	 * it must start with a digit, contain no '/' and no ':', and contain at
	 * most one '.'. If so, outputPattern is set to LITERAL_NUMBERS.
	 * <p>
	 * An empty token is reported as not a number instead of failing on
	 * charAt(0).
	 * 
	 * @return true if the token was classified as a literal number
	 * @throws Exception
	 *             not thrown by this implementation; the declaration is kept
	 *             for interface compatibility
	 */
	protected boolean analyzeNumbers() throws Exception {
		// guard: charAt(0) on an empty token would throw
		// StringIndexOutOfBoundsException
		if (hebWord.length() == 0 || !Character.isDigit(hebWord.charAt(0))) {
			return false;
		}
		// first and last '.' coincide when there is at most one '.'
		boolean isNumber = (hebWord.indexOf("/") == -1)
				&& (hebWord.indexOf(":") == -1)
				&& (hebWord.lastIndexOf(".") == hebWord.indexOf("."));
		if (isNumber) {
			outputPattern = LITERAL_NUMBERS;
		}
		return isNumber;
	}

	/**
	 * Checks whether the current token (word) is punctuation.
	 * <p>
	 * A one-character token is punctuation when it is a double quote or one of
	 * the characters in the list below. Any other token is punctuation only
	 * when it both starts and ends with '.'. On a match outputPattern is set
	 * to PUNCTUATION.
	 * 
	 * @return true if the token was classified as punctuation
	 * @throws Exception
	 *             not thrown by this implementation; the declaration is kept
	 *             for interface compatibility
	 */
	protected boolean analyzePunctuations() throws Exception {
		final String marks = "=.!;|}{][*^&%#@~$/+:()-,?_'";
		final boolean isPunctuation;

		if (word.length() == 1) {
			final char only = word.charAt(0);
			// single character: look it up directly in the list of marks
			isPunctuation = (only == '"') || (marks.indexOf(only) != -1);
		} else {
			// longer (or empty) token: only dot-delimited tokens qualify
			isPunctuation = word.startsWith(".") && word.endsWith(".");
		}

		if (isPunctuation) {
			outputPattern = PUNCTUATION;
		}
		return isPunctuation;
	}

	/**
	 * Finishes the lookup of a token: if neither foundBase nor foundWord is
	 * set, outputPattern becomes NO_ENTRY; otherwise both flags are cleared
	 * for the next token.
	 * 
	 * @return true if no analysis had been found for the token
	 */
	protected boolean noEntryInInflections() {
		if (foundBase || foundWord) {
			// something was found: just reset the flags
			foundBase = false;
			foundWord = false;
			return false;
		}
		outputPattern = NO_ENTRY;
		return true;
	}

	/**
	 * Handles a token that ends with an apostrophe, or whose first double
	 * quote is the one-before-last character.
	 * <p>
	 * In both cases the whole token is first looked up with
	 * Data.getGimatrias; a result other than -1 sets foundWord and is passed
	 * to buildGemetriaOutput. Otherwise the regular analyses are run on the
	 * whole token. For the apostrophe case only, if nothing was found the
	 * trailing apostrophe is stripped from word and the analyses are repeated
	 * on the shortened token.
	 * <p>
	 * Side effects: may change the static fields word, hebWord, foundWord and
	 * foundBase. Exceptions thrown by the analyses are printed and otherwise
	 * ignored.
	 * 
	 * @return the value of word when the method finishes (possibly shortened,
	 *         or replaced by "'")
	 */
	protected String apostropheInvertedCommasHandling() {
		// -1 is treated below as "no gimatria value for this token"
		int gimatriaVal = -1;
		if (word.endsWith("\'")) {
			//analyze the whole token
			try {
				gimatriaVal = Data.getGimatrias(word);
				if (gimatriaVal != -1) {
					//look for the word with the apostrophe at the end
					foundWord = true;
					buildGemetriaOutput(gimatriaVal);
				} else {
					// NOTE(review): unlike the inverted-commas branch below,
					// analyzeBaseAndPrefix() is not called here on the whole token
					analyzeBaseNoPrefix();
					analyzeAcronymsBaseAndPrefix();
				}
			} catch (Exception e1) {
				// best effort: the failure is only printed and processing continues
				e1.printStackTrace();
			}
			//separate the apostrophe
			if ((!foundWord) && (!foundBase)) {
				//separate to two tokens - the word and the apostrophe
				word = word.substring(0, word.length() - 1);
				// hebWord is cut to the length of the shortened word
				// NOTE(review): assumes word and hebWord have the same length - confirm
				hebWord = hebWord.substring(0, word.length());
				try {
					analyzeBaseNoPrefix();
					//handle the ' as a separate token
					if (foundWord) {
						word = hebWord = "'";
						analyzePunctuations();
					}
					// if word was just replaced by "'" its length is 1, so the
					// split loop in analyzeBaseAndPrefix does not iterate
					analyzeBaseAndPrefix();
					//handle the ' as a separate token
					if (foundBase) {
						word = hebWord = "'";
						analyzePunctuations();
					}
					// sets NO_ENTRY or resets the found flags
					noEntryInInflections();
				} catch (Exception e) {
					// best effort: the failure is only printed and processing continues
					e.printStackTrace();
				}
			}
		} else if (word.indexOf("\"") == (word.length() - 2)) {
			// NOTE(review): for a one-character word without a double quote both
			// sides are -1, so this branch is also taken
			try {
				gimatriaVal = Data.getGimatrias(word);
				if (gimatriaVal != -1) {
					foundWord = true;
					buildGemetriaOutput(gimatriaVal);
				} else {
					analyzeBaseNoPrefix();
					analyzeBaseAndPrefix();
					analyzeAcronymsBaseAndPrefix();
				}
			} catch (UnsupportedEncodingException e) {
				// best effort: the failure is only printed and processing continues
				e.printStackTrace();
			} catch (Exception e) {
				// best effort: the failure is only printed and processing continues
				e.printStackTrace();
			}
		}
		return word;
	}

	/**
	 * Copies the attribute flags of the current prefix record (pr) into the
	 * corresponding static fields of this class.
	 */
	protected void getPrefixesAttributes() {
		final PrefixRecord rec = pr;

		// definite article / relative HE
		definiteArticleTag = rec.isDefiniteArticleTag();
		defArtHE = rec.isDefArtHE();
		relHE = rec.isRelHE();

		// NOTE(review): prefPartUnitTag and prefPartUnit (further down) are both
		// filled from isPrefPartUnit() - confirm that this is intended
		prefPartUnitTag = rec.isPrefPartUnit();
		adverbKAF = rec.isAdverbKAF();

		// SHIN-based conjunctions
		subConOrRelSHIN = rec.isSubConOrRelSHIN();
		tempSubConKAFSHIN = rec.isTempSubConKAFSHIN();
		tempSubConBETSHIN = rec.isTempSubConBETSHIN();
		tempSubConMEMSHIN = rec.isTempSubConMEMSHIN();
		tempSubConLAMEDKAFSHIN = rec.isTempSubConLAMEDKAFSHIN();

		// summary tags
		relativizerTag = rec.isRelativizerTag();
		subordinatingConjunctionTag = rec.isSubordinatingConjunctionTag();
		temporalSubConjTag = rec.isTemporalSubConjTag();

		// prepositions
		prepBET = rec.isPrepBET();
		prepKAF = rec.isPrepKAF();
		prepLAMED = rec.isPrepLAMED();
		prepMEM = rec.isPrepMEM();

		prefPartUnit = rec.isPrefPartUnit();
		prepositionTag = rec.isPrepositionTag();
		conjunctionTag = rec.isConjunctionTag();
	}

	/**
	 * Tells whether the current token (word) should go through
	 * apostropheInvertedCommasHandling: its first double quote is the
	 * one-before-last character, or it ends with an apostrophe.
	 * <p>
	 * Note: for a one-character word without a double quote, indexOf yields -1
	 * and length - 2 is also -1, so the method returns true.
	 * 
	 * @return true if the token matches one of the two shapes above
	 */
	protected boolean gimatriaPossibility() {
		final int firstQuote = word.indexOf("\"");
		if (firstQuote == word.length() - 2) {
			return true;
		}
		return word.endsWith("'");
	}

	/**
	 * Validates the combination of the current prefix record with the
	 * attributes of the current base.
	 * <p>
	 * The prefix flags are first refreshed from pr through
	 * getPrefixesAttributes(). The combination is accepted only if none of the
	 * forbidden prefix / base patterns listed in the condition below applies;
	 * each negated clause is one forbidden pattern, expressed over the prefix
	 * flags and the integer-coded base attributes (posi, tensei, constructi,
	 * suffixFunctioni, baseQuantifierTypei, basePronounTypei,
	 * baseDefinitnessi).
	 * 
	 * @return true if the prefix + base combination passes all the rules
	 */
	protected boolean validateByRules() {
		getPrefixesAttributes();
		boolean validate = false;
		if (!((definiteArticleTag || defArtHE || relHE) && (posi == INTERROGATIVE))
				&& !((definiteArticleTag || defArtHE) && (posi == CONJUNCTION))
				&& !((definiteArticleTag || defArtHE) && (posi == QUANTIFIER) && (baseQuantifierTypei == BASE_QUANTIFIER_TYPE_NON_NUMERAL))
				&& !(definiteArticleTag && constructi == CONSTRUCT_TRUE)
				&& !(definiteArticleTag && posi != VERB && suffixFunctioni == SUFFIX_FUNCTION_POSSESSIVE)
				&& !(definiteArticleTag && posi == VERB && (tensei != TENSE_INFINITIVE))
				&& !(definiteArticleTag && posi == INTERJECTION)
				&& !(definiteArticleTag && posi == INTERROGATIVE)
				&& !(definiteArticleTag && basePronounTypei == BASE_PRONOUN_TYPE_INTERROGATIVE)
				&& !(definiteArticleTag && posi == ADVERB)
				&& !(definiteArticleTag && posi == PROPERNAME)
				&& !(prefPartUnit && posi == VERB && tensei == TENSE_IMPERATIVE)
				&& !(adverbKAF && (posi == PROPERNAME || posi == VERB
						|| posi == NOUN || posi == ADJECTIVE || posi == PRONOUN
						|| posi == PREPOSITION || posi == ADVERB
						|| posi == CONJUNCTION || posi == INTERJECTION
						|| posi == INTERROGATIVE || posi == NEGATION || posi == PARTICIPLE))
				// NOTE(review): '&&' binds tighter than '||', so as written the next
				// clause rejects every INTERJECTION and INTERROGATIVE base regardless
				// of the prefix flags; confirm whether the three pos tests were meant
				// to be grouped together inside one pair of parentheses.
				&& !((subConOrRelSHIN || tempSubConKAFSHIN || tempSubConMEMSHIN || tempSubConLAMEDKAFSHIN)
						&& (posi == CONJUNCTION) || posi == INTERJECTION || posi == INTERROGATIVE)
				&& !((relativizerTag || subordinatingConjunctionTag || temporalSubConjTag) && posi == CONJUNCTION)
				&& !(tempSubConBETSHIN && posi == PROPERNAME)
				&& !((prepBET || prepKAF || prepLAMED) && definiteArticleTag && baseDefinitnessi == BASE_DEFINITNESS_FALSE)
				&& !((prefPartUnit)
						&& baseDefinitnessi == BASE_DEFINITNESS_TRUE_TRUE
						&& !prepMEM && !subConOrRelSHIN && !conjunctionTag)
				&& !((prefPartUnit)
						&& baseDefinitnessi == BASE_DEFINITNESS_FALSE
						&& definiteArticleTag )
				&& !((prepBET || prepKAF || prepLAMED || prepMEM) && (posi == VERB && (tensei == TENSE_PAST
						|| tensei == TENSE_FUTURE || tensei == TENSE_IMPERATIVE)))
				&& !(prepLAMED && posi == VERB && tensei == TENSE_INFINITIVE)
				&& !(prepositionTag && posi == VERB && (tensei == TENSE_BARE_INFINITIVE))
				//&& !(prepositionTag && posi == PARTICIPLE)
				&& !(prepositionTag && posi == INTERJECTION)
				&& !(prepositionTag && posi == INTERROGATIVE)
				&& !(prepositionTag && posi == ADVERB)
				&& !(prepositionTag && basePronounTypei == BASE_PRONOUN_TYPE_PERSONAL)
				&& !((prepBET || prepKAF || prepLAMED || prepMEM) && posi == PREPOSITION)
				&& !((relativizerTag || subordinatingConjunctionTag || temporalSubConjTag) && posi == INTERJECTION)
				&& !(prepositionTag && posi == CONJUNCTION)
		//&& !((conjunctionTag || definiteArticleTag || subConOrRelSHIN
		//		|| tempSubConKAFSHIN || tempSubConMEMSHIN || tempSubConLAMEDKAFSHIN)
		//		&& posi == VERB && tensei == TENSE_INFINITIVE)
		)
			validate = true;

		return validate;
	}

	/**
	 * Validates an analysis of the whole token (no prefix split off).
	 * <p>
	 * The rules only concern tokens that start with a single "w" (that is,
	 * with "w" but not with "ww"). Such a token is rejected when the script is
	 * SCRIPT_FORMAL, or when the script is SCRIPT_COLLOQUIAL and the suffix
	 * function is SUFFIX_FUNCTION_POSSESSIVE.
	 * 
	 * @return true if the analysis passes the rules
	 */
	protected boolean validateByRulesWithoutPrefixes() {
		final boolean singleLeadingW = word.startsWith("w")
				&& !word.startsWith("ww");
		if (!singleLeadingW) {
			// the rules do not apply to this token
			return true;
		}
		if (scripti == SCRIPT_FORMAL) {
			return false;
		}
		return !(scripti == SCRIPT_COLLOQUIAL
				&& suffixFunctioni == SUFFIX_FUNCTION_POSSESSIVE);
	}

	/**
	 * Tries the prefix + base splits of the current token (word): for split
	 * positions 1 to 5 (and below the token length) the token is cut into a
	 * prefix and a base, which are passed to analyzeBase. The scan stops as
	 * soon as the candidate prefix contains an apostrophe.
	 * <p>
	 * NOTE(review): the original comment says the longest known prefix is 6
	 * characters long, but the loop bound only tries prefixes of up to 5
	 * characters - confirm which is intended.
	 * 
	 * @throws Exception
	 *             propagated from analyzeBase
	 */
	protected void analyzeBaseAndPrefix() throws Exception {
		int split = 1;
		// word is re-read on every iteration, exactly as in a plain for loop
		while (split < 6 && split < word.length()) {
			final String prefix = word.substring(0, split);
			if (prefix.indexOf("'") != -1) {
				// an apostrophe inside the prefix ends the scan
				return;
			}
			final String base = word.substring(split);
			analyzeBase(base, prefix);
			split++;
		}
	}

	/**
	 * Tries the prefix + base splits of a token that contains an apostrophe or
	 * inverted commas: for split positions 1 to 5 (and below the token length)
	 * the two parts are passed to analyzePrefixGimatriaAndInvertedCommas. The
	 * scan stops at the first split position whose character is a double quote
	 * or an apostrophe.
	 * 
	 * @throws Exception
	 *             not thrown by this implementation; the declaration is kept
	 *             for interface compatibility
	 */
	protected void analyzeAcronymsBaseAndPrefix() throws Exception {
		for (int split = 1; split < 6 && split < word.length(); split++) {
			final char atSplit = word.charAt(split);
			if (atSplit == '"' || atSplit == '\'') {
				// the quote character marks the end of the candidate prefixes
				break;
			}
			final String prefix = word.substring(0, split);
			final String base = word.substring(split);
			analyzePrefixGimatriaAndInvertedCommas(base, prefix);
		}
	}

	/**
	 * Parses the command-line arguments into the static configuration fields
	 * and echoes each value through Messages.printErrMessage.
	 * <p>
	 * Accepted layouts (args[0] is always the web flag):
	 * <ul>
	 * <li>6 arguments: webFlag inputFile outputFile dinflectionsFile
	 * dprefixesFile gimatriasFile</li>
	 * <li>5 arguments: webFlag inputFile dinflectionsFile dprefixesFile
	 * gimatriasFile</li>
	 * <li>4 arguments, webFlag false: webFlag dinflectionsFile dprefixesFile
	 * gimatriasFile (inputFile and outputFile are cleared)</li>
	 * <li>4 arguments, webFlag true: webFlag inputFile outputFile (the fourth
	 * argument is not used)</li>
	 * <li>3 arguments: webFlag inputFile outputFile</li>
	 * </ul>
	 * Any other argument count prints an error message and terminates the JVM.
	 * 
	 * @param args
	 *            the command-line arguments
	 */
	protected void handleInputParameters(String[] args) {
		int argc = args.length;
		switch (argc) {
		case 6:
			webFlag = Boolean.valueOf(args[0]).booleanValue();
			Messages.printErrMessage("webFlag=", webFlag);
			inputFile = args[1];
			Messages.printErrMessage("input File=", inputFile);
			outputFile = args[2];
			Messages.printErrMessage("output File=", outputFile);
			dinflectionsFile = args[3];
			Messages.printErrMessage("dinflections File=", dinflectionsFile);
			dprefixesFile = args[4];
			Messages.printErrMessage("dprefixes File=", dprefixesFile);
			gimatriasFile = args[5];
			Messages.printErrMessage("gimatrias File=", gimatriasFile);
			break;
		case 5:
			webFlag = Boolean.valueOf(args[0]).booleanValue();
			// previously an always-empty StringBuffer was logged here instead of
			// the parsed flag
			Messages.printErrMessage("webFlag=", webFlag);
			inputFile = args[1];
			Messages.printErrMessage("input File=", inputFile);
			dinflectionsFile = args[2];
			Messages.printErrMessage("dinflections File=", dinflectionsFile);
			dprefixesFile = args[3];
			Messages.printErrMessage("dprefixes File=", dprefixesFile);
			gimatriasFile = args[4];
			Messages.printErrMessage("gimatrias File=", gimatriasFile);
			break;
		case 4:
			webFlag = Boolean.valueOf(args[0]).booleanValue();
			// previously an always-empty StringBuffer was logged here instead of
			// the parsed flag
			Messages.printErrMessage("webFlag=", webFlag);
			if (!webFlag) {
				// input and output are not taken from files in this layout
				outputFile = "";
				inputFile = "";
				dinflectionsFile = args[1];
				Messages
						.printErrMessage("dinflections File=", dinflectionsFile);
				dprefixesFile = args[2];
				Messages.printErrMessage("dprefixes File=", dprefixesFile);
				gimatriasFile = args[3];
				Messages.printErrMessage("gimatrias File=", gimatriasFile);
			} else {
				inputFile = args[1];
				Messages.printErrMessage("input File=", inputFile);
				outputFile = args[2];
				Messages.printErrMessage("output File=", outputFile);
			}
			break;
		case 3:
			webFlag = Boolean.valueOf(args[0]).booleanValue();
			Messages.printErrMessage("webFlag=", webFlag);
			inputFile = args[1];
			Messages.printErrMessage("input File=", inputFile);
			outputFile = args[2];
			Messages.printErrMessage("output File=", outputFile);
			break;
		default:
			Messages.printErrMessage("wrong parameters number");
			// NOTE(review): exit status 0 signals success to the calling shell
			// although this is an error path (inputFileHandling uses 1); kept
			// unchanged so existing scripts are not affected - confirm
			System.exit(0);
		}

	}

	/**
	 * Opens inputFile for reading with the "UTF8" charset and stores the
	 * resulting buffered reader in bi.
	 * <p>
	 * If the file cannot be found, a message and the stack trace are printed
	 * and the JVM is terminated with status 1. An unsupported-encoding failure
	 * is only printed, leaving bi unchanged.
	 */
	protected void inputFileHandling() {
		try {
			final FileInputStream rawBytes = new FileInputStream(inputFile);
			final InputStreamReader decoded = new InputStreamReader(rawBytes,
					"UTF8");
			bi = new BufferedReader(decoded);
		} catch (FileNotFoundException missing) {
			System.err.println("input File not found:  " + inputFile);
			missing.printStackTrace();
			System.exit(1);
		} catch (UnsupportedEncodingException badCharset) {
			badCharset.printStackTrace();
		}
	}

	/**
	 * Takes a timestamp and, unless webFlag is set, reports on stderr the time
	 * elapsed since startTime.
	 * 
	 * @param startTime
	 *            an earlier System.currentTimeMillis() reading
	 * @return the timestamp taken by this call, in milliseconds
	 */
	protected long printTimesHandling(long startTime) {
		final long now = System.currentTimeMillis();
		if (!webFlag) {
			final long elapsed = now - startTime;
			System.err.println("load2Memory Elapsed time = " + elapsed + " ms");
		}
		return now;
	}

	/**
	 * Analyzes the current input token (hebWord) by trying the analyses in a
	 * fixed order and stopping at the first one that succeeds: prefix marker,
	 * URL, foreign word, punctuation, number. Before the punctuation check the
	 * token is converted with Translate.Heb2Eng and stored in word.
	 * <p>
	 * If none of these matches, the token either goes through
	 * apostropheInvertedCommasHandling (when gimatriaPossibility() holds) or
	 * through the regular lookup: whole token, prefix + base splits, and
	 * finally noEntryInInflections.
	 * 
	 * @throws IOException
	 *             declared for interface compatibility
	 * @throws Exception
	 *             propagated from the individual analyses
	 */
	protected void readInput() throws IOException, Exception {
		if (analyzePrefix()) {
			return;
		}
		if (analyzeURL()) {
			return;
		}
		if (analyzeForeign()) {
			return;
		}

		// the remaining analyses work on the converted form of the token
		word = Translate.Heb2Eng(hebWord);

		if (analyzePunctuations()) {
			return;
		}
		if (analyzeNumbers()) {
			return;
		}
		if (gimatriaPossibility()) {
			apostropheInvertedCommasHandling();
			return;
		}

		analyzeBaseNoPrefix();
		analyzeBaseAndPrefix();
		noEntryInInflections();
	}
}