package tokenizers;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.util.StringTokenizer;

import lexicon.analyse.XMLMorphAnalyzer;
import lexicon.utils.Translate;

import org.dom4j.Document;

/*
 * Created on 12/12/2005
 *
 * TODO To change the template for this generated file go to
 * Window - Preferences - Java - Code Style - Code Templates
 */

/**
 * Splits input text (Hebrew, optionally mixed with Latin letters, digits and
 * punctuation) into tokens, sentences and paragraphs, and records them in an
 * XML document that is built through {@code XMLProcessor}.
 * 
 * @author daliabo
 */
public class XMLTokenizer {
	/**
	 * Latin transliterations of the prefix-letter combinations recognised by
	 * the {@code moshevkaleb} methods: a candidate prefix is passed through
	 * {@code Translate.Heb2Eng} and compared against these entries.
	 * NOTE(review): "wmeh", "wemeh" and "lkeh" each appear twice; harmless for
	 * the linear lookup, but possibly typos for other combinations - confirm.
	 */
	static final String[] prefixArray = { "w", "wh", "e", "eh", "we", "weh",
			"ke", "keh", "wke", "wkeh", "weke", "wekeh", "eke", "ekeh", "me",
			"meh", "wmeh", "wemeh", "wme", "wmeh", "weme", "wemeh", "eme",
			"emeh", "lke", "lkeh", "wlke", "wlkeh", "welke", "welkeh", "elke",
			"elkeh", "b", "wb", "eb", "web", "keb", "wkeb", "ekeb", "wekeb",
			"meb", "wmeb", "emeb", "wemeb", "lkeb", "wlkeb", "elkeb", "welkeb",
			"l", "wl", "el", "wel", "kel", "wkel", "ekel", "wekel", "mel",
			"wmel", "emel", "wemel", "lkel", "wlkel", "elkel", "welkel", "k",
			"wk", "ek", "wek", "kek", "wkek", "ekek", "wekek", "mek", "wmek",
			"emek", "wemek", "lkek", "wlkek", "elkek", "welkek", "m", "mh",
			"wm", "em", "wem", "kem", "wkem", "ekem", "wekem", "mem", "wmem",
			"emem", "wemem", "lkem", "wlkem", "elkem", "welkem", "h", "lkeh" };

	/** Upper bound on the empty lines skipped at the start of the input. */
	final int MAX_EMPTY_LINES = 100;

	/**
	 * Set when an empty line was just skipped in the middle of the input;
	 * cleared again when the next non-empty line is reached.
	 */
	boolean emptyLineFlag = false;

	/** Input file path; an empty string makes ioFileHandling read stdin. */
	protected String inputFile = "";

	/** Output file path handed to the XML processor's printDoc in newProcess. */
	protected String outputFile = "";

	// NOTE(review): not referenced anywhere in the visible code.
	private InputStreamReader pIn = null;

	/** Reader over the text being tokenized. */
	protected BufferedReader bi = null;

	// Returned by newProcess; never incremented in the visible code.
	int tokenCounter = 0;

	// NOTE(review): not referenced anywhere in the visible code.
	boolean emptyLine = false;

	/** The input line currently being processed; null once input is exhausted. */
	String line = "";

	/** Builder of the output XML document (created by createTokenizedXML). */
	protected XMLProcessor xmlTokenizer = null;

	/** Result of the most recent prefix lookup made by seperateWords. */
	boolean mshwklbFlag = false;

	/** Number of non-empty input lines tokenized so far. */
	int lineCounter = 0;
	
	/**
	 * When true, the processing loops stop once the XML processor reports more
	 * than 100 tokens; set by the XSL variant of tokenizeAndAnalyze.
	 */
	boolean webFlag = false;

	/**
	 * Opens the UTF-8 reader {@code bi}: over {@code inputFile} when a file
	 * name was supplied, otherwise over standard input. Failures are reported
	 * on standard output and leave {@code bi} unchanged.
	 */
	private void ioFileHandling() {
		if (inputFile.equals("")) {
			// No file name given: tokenize whatever arrives on standard input.
			try {
				InputStreamReader stdinReader = new InputStreamReader(
						System.in, "UTF8");
				bi = new BufferedReader(stdinReader);
			} catch (UnsupportedEncodingException e) {
				System.out
						.println("XMLTokenizer:ioFileHandling UnsupportedEncodingException happened");
				e.printStackTrace();
			}
			return;
		}

		try {
			FileInputStream fileStream = new FileInputStream(inputFile);
			InputStreamReader fileReader = new InputStreamReader(fileStream,
					"UTF8");
			bi = new BufferedReader(fileReader);
		} catch (UnsupportedEncodingException e) {
			System.out
					.println("XMLTokenizer:ioFileHandling UnsupportedEncodingException happened");
			e.printStackTrace();
		} catch (FileNotFoundException e) {
			System.out
					.println("XMLTokenizer:ioFileHandling File not find - please check input/output file parameter");
			e.printStackTrace();
		}
	}

	/**
	 * Skips empty lines at the beginning of the input and leaves the first
	 * non-empty line in {@code line}. At most MAX_EMPTY_LINES empty lines are
	 * skipped; a leading byte-order mark (U+FEFF) is stripped from the first
	 * non-empty line, and empty lines that follow the mark are skipped under
	 * the same limit.
	 * <p>
	 * When no text is found, a message is written to standard error and the
	 * method returns with {@code line} either {@code null} (end of input) or
	 * empty (limit reached). It never terminates the JVM, because it is also
	 * reached from the web entry points.
	 */
	private void dropStartFileEmptyLines() {
		try {
			line = bi.readLine();

			if (!skipEmptyLinesFromCurrent()) {
				System.err
						.println("No input to process or there are more than MAX_EMPTY_LINES at the begining of the file, Exiting");
				return;
			}

			// BOM handling - skip BOM and continue processing.
			// line is known to be non-empty here, so charAt(0) is safe.
			if (line.charAt(0) == 0xFEFF) {
				line = line.substring(1);
			}

			// The line may have consisted of the BOM only: skip again.
			if (!skipEmptyLinesFromCurrent()) {
				System.err.println("No input to process");
				return;
			}

		} catch (IOException e) {
			System.out
					.println("Tokenizer:dropEmptyLines - IOException occured while trying to read input file lines");
			e.printStackTrace();
		} catch (Exception e) {
			// Also covers a reader that was never opened (bi == null).
			System.out
					.println("Tokenizer:dropEmptyLines - Exception occured while trying to read input file lines");
			e.printStackTrace();
		}
	}

	/**
	 * Advances {@code line} past empty lines, reading at most MAX_EMPTY_LINES
	 * further lines and stopping immediately at end of input.
	 * 
	 * @return true when {@code line} now holds a non-empty line
	 * @throws IOException
	 *             if reading from {@code bi} fails
	 */
	private boolean skipEmptyLinesFromCurrent() throws IOException {
		int skipped = 0;
		while (line != null && line.equals("") && skipped < MAX_EMPTY_LINES) {
			line = bi.readLine();
			skipped++;
		}
		// Decide on the line itself rather than on the counter, so text found
		// by the very last permitted read is not mistaken for "no input".
		return line != null && !line.equals("");
	}

	/**
	 * Handles an empty line in the middle of the input. Empty lines separate
	 * paragraphs: when the current line is empty, the next line is read and
	 * the caller is told to re-examine it. When the current line is the first
	 * non-empty line after one or more empty ones (and some text was already
	 * processed), a new paragraph and a new sentence are opened.
	 * <p>
	 * If reading the next line fails, {@code line} is set to {@code null} so
	 * that the caller's {@code while (line != null)} loop ends instead of
	 * retrying the failing read forever.
	 * 
	 * @return true when the current line was empty and has been replaced by
	 *         the next one; false when the current line holds text to tokenize
	 */
	private boolean dropMiddleFileEmptyLines() {
		if (line.equals("")) {
			try {
				line = bi.readLine();
			} catch (IOException e) {
				System.out
						.println("XMLTokenizer:dropMiddleFileEmptyLines - Exception in readLine line= "
								+ line);
				e.printStackTrace();
				// Treat an unreadable stream as end of input.
				line = null;
			}
			emptyLineFlag = true;
			return true;
		}

		line = line.trim();

		if (emptyLineFlag) {
			// lineCounter == 0 means only leading empty lines were seen; the
			// first paragraph and sentence were already opened by the caller.
			if (lineCounter > 0) {
				xmlTokenizer.createParapraphes();
				xmlTokenizer.createSentences();
			}
			emptyLineFlag = false;
		}

		return false;
	}

	/**
	 * Creates a fresh {@code XMLProcessor}, stores it in {@code xmlTokenizer}
	 * and asks it to create the output document and its article element.
	 * 
	 * @throws IOException
	 *             declared for callers; see {@code XMLProcessor} for when it
	 *             is actually raised
	 */
	private void createTokenizedXML() throws IOException {
		final XMLProcessor processor = new XMLProcessor();
		// Publish the processor before initialising it, so the field is set
		// even if one of the following calls fails.
		xmlTokenizer = processor;
		processor.createDocument();
		processor.createArticle();
	}

	/**
	 * Emits the token unchanged when it both starts and ends with a lower-case
	 * Latin letter, or both starts and ends with an upper-case Latin letter
	 * (e.g. URLs, mail addresses).
	 * 
	 * @param token
	 *            the token to examine
	 * @param first
	 *            first character of the token
	 * @param last
	 *            last character of the token
	 * @return true when the token was emitted
	 */
	private boolean identifyEnglish(String token, char first, char last) {
		final boolean lowerCaseEnds = first >= 'a' && first <= 'z'
				&& last >= 'a' && last <= 'z';
		final boolean upperCaseEnds = first >= 'A' && first <= 'Z'
				&& last >= 'A' && last <= 'Z';

		if (!lowerCaseEnds && !upperCaseEnds) {
			return false;
		}
		xmlTokenizer.createTokens(token);
		return true;
	}

	/**
	 * Splits a punctuation-free token that starts with a Latin letter and ends
	 * with a Hebrew letter (or the other way round).
	 * <p>
	 * The token is scanned from the left. At the first letter that is directly
	 * followed by a digit it is split into text / digit run / remainder; at
	 * the first direct Hebrew-Latin (or Latin-Hebrew) boundary it is split in
	 * two. The scan only looks at adjacent pairs, so it stops one character
	 * before the end and never indexes past the token.
	 * 
	 * @param token
	 *            the token to examine
	 * @param first
	 *            first character of the token
	 * @param last
	 *            last character of the token
	 * @param len
	 *            length of the token
	 * @return true when the token was split and emitted; false when it
	 *         contains punctuation, is not mixed, or has no adjacent boundary
	 */
	private boolean mixedEnglishHebrew(String token, char first, char last,
			int len) {
		// Tokens containing any of these characters are left to other handlers.
		final String excluded = "–-:/),.+=\"'(?";
		for (int k = 0; k < excluded.length(); k++) {
			if (token.indexOf(excluded.charAt(k)) != -1)
				return false;
		}

		final boolean firstLatin = (first >= 'a' && first <= 'z')
				|| (first >= 'A' && first <= 'Z');
		final boolean firstHebrew = first >= 'א' && first <= 'ת';
		final boolean lastLatin = (last >= 'a' && last <= 'z')
				|| (last >= 'A' && last <= 'Z');
		final boolean lastHebrew = last >= 'א' && last <= 'ת';
		if (!((firstLatin && lastHebrew) || (firstHebrew && lastLatin)))
			return false;

		// j + 1 < len: every iteration inspects the pair (j, j + 1).
		for (int j = 0; j + 1 < len; j++) {
			final char cur = token.charAt(j);
			final char next = token.charAt(j + 1);
			final boolean curHebrew = cur >= 'א' && cur <= 'ת';
			final boolean curLatin = (cur >= 'a' && cur <= 'z')
					|| (cur >= 'A' && cur <= 'Z');

			if ((curHebrew || curLatin) && Character.isDigit(next)) {
				final int digitsStart = j + 1;
				int digitsEnd = digitsStart;
				while (digitsEnd < len
						&& Character.isDigit(token.charAt(digitsEnd)))
					digitsEnd++;

				xmlTokenizer.createTokens(token.substring(0, digitsStart));
				xmlTokenizer.createTokens(token.substring(digitsStart,
						digitsEnd));
				if (digitsEnd < len)
					xmlTokenizer.createTokens(token.substring(digitsEnd));
				return true;
			}

			final boolean nextHebrew = next >= 'א' && next <= 'ת';
			final boolean nextLatin = (next >= 'a' && next <= 'z')
					|| (next >= 'A' && next <= 'Z');
			if ((curHebrew && nextLatin) || (curLatin && nextHebrew)) {
				xmlTokenizer.createTokens(token.substring(0, j + 1));
				xmlTokenizer.createTokens(token.substring(j + 1));
				return true;
			}
		}
		return false;
	}

	/**
	 * Splits a token in which text and a number are glued together, emitting
	 * the text part(s) and the number as separate tokens.
	 * 
	 * @param token
	 *            the token to examine
	 * @param len
	 *            length of the token
	 * @return true when a text/number boundary was found and the parts were
	 *         emitted; false when the token was left untouched
	 * @throws Exception
	 *             declared for callers; nothing in the visible body throws a
	 *             checked exception
	 */
	private boolean mixedNumbersHebrew(String token, int len) throws Exception {
		boolean rt = false;
		int i1 = 0;
		int j;
		// Tokens starting with a digit and containing '/', '.' or ':' are not
		// handled here (presumably dates, decimals and times - confirm).
		if (Character.isDigit(token.charAt(0))
				&& ((token.indexOf("/") != -1) || (token.indexOf(".") != -1) || (token
						.indexOf(":") != -1)))
			return rt;
		// Neither are tokens that start and end with a digit, nor tokens whose
		// first '-' is at index 1. && binds tighter than ||, so this reads
		// (digit-first && digit-last) || ('-' present && '-' at index 1).
		if (Character.isDigit(token.charAt(0))
				&& Character.isDigit(token.charAt(len - 1))
				|| (token.indexOf("-") != -1) && token.indexOf("-") == 1)
			return rt;
		// Find the first index j that is either a non-digit directly followed
		// by a digit, or a digit directly followed by a non-digit that is not
		// the last character of the token.
		for (j = 0; j < len; j++)
			if ((!Character.isDigit(token.charAt(j)) && (j + 1 <= len - 1) && Character
					.isDigit(token.charAt(j + 1)))

					|| (Character.isDigit(token.charAt(j)) && (j + 1 < len - 1) && !Character
							.isDigit(token.charAt(j + 1)))) {
				rt = true;
				break;
			}
		if (rt) {
			String subSt1 = "";
			String number = "";
			String subSt2 = "";
			int n = j + 1;
			if (Character.isDigit(token.charAt(j + 1))) {
				// Text followed by a number: extend n over the digits and any
				// ',', '.' or ':' embedded in the number.
				while (n <= len - 1
						&& ((Character.isDigit(token.charAt(n)))
								|| (token.charAt(n) == ',')
								|| (token.charAt(n) == '.') || (token.charAt(n) == ':')))
					n++;
				subSt1 = token.substring(0, j + 1);
				number = token.substring(j + 1, n);
				// A trailing separator belongs to the remainder, not to the
				// number.
				if (!Character.isDigit(number.charAt(number.length() - 1))) {
					number = token.substring(j + 1, n - 1);
					subSt2 = token.substring(n - 1);
				} else
					subSt2 = token.substring(n);

				// Emit the leading text: directly when it is one character,
				// otherwise through the prefix/suffix punctuation analysis.
				len = subSt1.length();
				if (len == 1)
					xmlTokenizer.createTokens(subSt1);
				else {
					i1 = analyzePrefixToken(subSt1, len);
					analyzeSuffixToken(subSt1, len, i1);
				}

				xmlTokenizer.createTokens(number);

				// Emit whatever follows the number, if anything.
				len = subSt2.length();
				if (len > 0) {

					i1 = analyzePrefixToken(subSt2, len);
					analyzeSuffixToken(subSt2, len, i1);

				}
			} else {
				// Number followed by text.
				// NOTE(review): the text is emitted BEFORE the number although
				// the number comes first in the string - confirm this order is
				// intended.
				number = token.substring(0, j + 1);
				subSt1 = token.substring(j + 1);
				len = subSt1.length();
				if (len == 1)
					xmlTokenizer.createTokens(subSt1);
				else {
					i1 = analyzePrefixToken(subSt1, len);
					analyzeSuffixToken(subSt1, len, i1);
				}

				xmlTokenizer.createTokens(number);
			}
		}

		return rt;
	}

	/**
	 * Tries the special-case handlers on a token, in a fixed order, and
	 * reports whether one of them consumed it.
	 * <p>
	 * A one-character token is emitted as is; when it is '.', '?' or '!' a new
	 * sentence is started as well. Longer tokens are offered to
	 * mixedEnglishHebrew, identifyEnglish, mixedNumbersHebrew and
	 * seperateWords, in that order; finally a token that starts and ends with
	 * '.' (e.g. "...") is emitted whole.
	 * 
	 * @param token
	 *            a non-empty token
	 * @return true when the token was fully handled; false when the caller
	 *         still has to run the prefix/suffix analysis on it
	 * @throws Exception
	 *             propagated from mixedNumbersHebrew
	 */
	private boolean handleToken(String token) throws Exception {
		final int length = token.length();
		final char head = token.charAt(0);
		final char tail = token.charAt(length - 1);

		// Single character, e.g. a free-standing comma.
		if (length == 1) {
			xmlTokenizer.createTokens(String.valueOf(head));
			switch (head) {
			case '.':
			case '?':
			case '!':
				// End-of-sentence punctuation opens a new sentence.
				xmlTokenizer.createSentences();
				break;
			default:
				break;
			}
			return true;
		}

		// Short-circuit evaluation keeps the handlers in their fixed order and
		// stops at the first one that consumes the token.
		if (mixedEnglishHebrew(token, head, tail, length)
				|| identifyEnglish(token, head, tail)
				|| mixedNumbersHebrew(token, length)
				|| seperateWords(token)) {
			return true;
		}

		// The "..." case.
		if (head == '.' && tail == '.') {
			xmlTokenizer.createTokens(token);
			return true;
		}
		return false;
	}

	/**
	 * Emits one piece of a token that seperateWords has split off. A single
	 * character is emitted directly; anything longer goes through the
	 * prefix/suffix punctuation analysis. Null or empty input is ignored.
	 * <p>
	 * Processing is best effort: a runtime failure in either analysis step is
	 * reported and the other step still runs.
	 * 
	 * @param subSt
	 *            the piece to emit
	 */
	private void handleSubWord(String subSt) {
		if (subSt == null || subSt.length() == 0)
			return;

		final int len = subSt.length();
		if (len == 1) {
			xmlTokenizer.createTokens(subSt);
			return;
		}

		// 0 is kept as the prefix end if the prefix analysis fails, so the
		// suffix analysis still covers the whole piece.
		int i1 = 0;
		try {
			i1 = analyzePrefixToken(subSt, len);
		} catch (Exception e) {
			System.out
					.println("XMLTokenizer:handleSubWord - Exception in analyzePrefixToken for "
							+ subSt);
			e.printStackTrace();
		}
		try {
			analyzeSuffixToken(subSt, len, i1);
		} catch (Exception e) {
			System.out
					.println("XMLTokenizer:handleSubWord - Exception in analyzeSuffixToken for "
							+ subSt);
			e.printStackTrace();
		}
	}

	/**
	 * Tells whether the given string, once transliterated with
	 * {@code Translate.Heb2Eng}, equals one of the entries of
	 * {@code prefixArray}.
	 * 
	 * @param possiblePrefix
	 *            candidate prefix
	 * @return true when the transliteration is a known prefix combination
	 */
	private boolean moshevkaleb(String possiblePrefix) {
		final String transliterated = Translate.Heb2Eng(possiblePrefix);
		int k = 0;
		while (k < prefixArray.length) {
			if (prefixArray[k].equals(transliterated)) {
				return true;
			}
			k++;
		}
		return false;
	}

	/**
	 * Looks for a punctuation character inside the token at which the token
	 * has to be split, and emits the resulting pieces (recursing on the part
	 * after the split point).
	 * <p>
	 * The token is scanned from the left. Hebrew letters and characters with
	 * no special rule are skipped. A double quote, comma, colon, slash, dot,
	 * apostrophe, hyphen, dash, '=', parenthesis, '?' or '[' is checked
	 * against its neighbours and either skipped (it is part of the word or
	 * number), causes the whole token to be emitted, or stops the scan so
	 * that the token is split there.
	 * 
	 * @param token
	 *            the token to examine
	 * @return true when the token was emitted (whole or in pieces); false when
	 *         nothing was emitted and the caller has to handle the token
	 */
	private boolean seperateWords(String token) {
		boolean rt = false;
		boolean rt2 = false;
		int index = 0;
		// Minimal length both halves must have for a split to be accepted.
		int limit = 3;
		boolean prefixFlag = false;
		boolean stopLoopingFlag = false;
		char c = 0;
		// Index of the last character of the token.
		int len = token.length() - 1;

		while (index <= len) {
			c = token.charAt(index);
			if (c < 'א' || c > 'ת') {
				switch (c) {
				case '"': {
					// A two-character token containing a quote stays whole.
					if (len == 1) {
						xmlTokenizer.createTokens(token);
						return true;
					}

					boolean punctuationTwoAhead = false;
					if (index + 2 < len) {
						char twoAhead = token.charAt(index + 2);
						punctuationTwoAhead = (twoAhead == '.')
								|| (twoAhead == '?') || (twoAhead == '"')
								|| (twoAhead == '—') || (twoAhead == '-')
								|| (twoAhead == ':');
					}

					// The quote sits two characters before one of these
					// endings.
					int tokenLength = token.length();
					boolean beforeKnownEnding = (token.endsWith("יים") && (index == tokenLength - 5))
							|| (token.endsWith("ים") && (index == tokenLength - 4))
							|| (token.endsWith("ית") && (index == tokenLength - 4))
							|| (token.endsWith("יות") && (index == tokenLength - 5))
							|| (token.endsWith("ניק") && (index == tokenLength - 5))
							|| (token.endsWith("ניקים") && (index == tokenLength - 7))
							|| (token.endsWith("ניקיות") && (index == tokenLength - 8))
							|| (token.endsWith("יסט") && (index == tokenLength - 5))
							|| (token.endsWith("יסטית") && (index == tokenLength - 7))
							|| (token.endsWith("יסטיות") && (index == tokenLength - 8));

					boolean dotFollows = (index + 1 < len)
							&& token.charAt(index + 1) == '.';
					boolean letterPrecedes = false;
					if (index - 1 >= 0) {
						char prev = token.charAt(index - 1);
						letterPrecedes = (prev >= 'א' && prev <= 'ת')
								|| (prev >= 'A' && prev <= 'Z')
								|| (prev >= 'a' && prev <= 'z');
					}

					if (punctuationTwoAhead || beforeKnownEnding)
						index++;
					else if (dotFollows || letterPrecedes)
						stopLoopingFlag = true;
					else
						index++;
					break;
				}

				//1,00,000
				case ',':
					if ((index != len) && (index + 1 <= len)
							&& Character.isDigit(token.charAt(index + 1))) {
						xmlTokenizer.createTokens(token);
						return true;
					} else
						stopLoopingFlag = true;
					// NOTE(review): falls through into the ':' '/' '.' rules,
					// as the original code did; the only visible effect is the
					// "return false" below when a '.' follows 2 or 3
					// characters later. Confirm this is intended.

				case ':':
				case '/':
				//handle 13.6.2006 35,000 13/6/2006
				case '.':
					if (((index + 2 <= len) && token.charAt(index + 2) == '.')
							|| ((index + 3 <= len) && token.charAt(index + 3) == '.'))
						return false;
					else if ((index + 1 < len) && (index - 1 > 0)
							&& Character.isDigit(token.charAt(index - 1))
							&& Character.isDigit(token.charAt(index + 1)))
						index++;
					else
						stopLoopingFlag = true;
					break;

				case '\'': {
					int tokenLength = token.length();
					// Guarded: a token shorter than two characters has no
					// "all but the last two characters" part.
					mshwklbFlag = tokenLength >= 2
							&& moshevkaleb(token.substring(0, tokenLength - 2));
					if (mshwklbFlag) {
						xmlTokenizer.createTokens(token.substring(0,
								tokenLength - 2));
						xmlTokenizer.createTokens(token
								.substring(tokenLength - 2));
						return true;
					}

					boolean twoChars = tokenLength == 2;
					char second = twoChars ? token.charAt(1) : 0;
					boolean secondHebrew = twoChars && second >= 'א'
							&& second <= 'ת';
					boolean secondUpper = twoChars && second >= 'A'
							&& second <= 'Z';

					if (secondHebrew) {
						// Two characters, the second one a Hebrew letter.
						xmlTokenizer.createTokens(token);
						return true;
					} else if (((index - 1) >= 0)
							&& ((token.charAt(index - 1) == 'ג')
									|| (token.charAt(index - 1) == 'ד')
									|| (token.charAt(index - 1) == 'ז')
									|| (token.charAt(index - 1) == 'צ')
									|| (token.charAt(index - 1) == 'ת') || (token
									.charAt(index - 1) == 'ע')))
						index++;
					else if (twoChars && index == 0 && !secondHebrew
							&& !secondUpper) {
						// Apostrophe followed by one other character: emit
						// both separately. index == 0 guarantees that the
						// character after the apostrophe exists.
						xmlTokenizer.createTokens(String.valueOf(c));
						xmlTokenizer.createTokens(String.valueOf(second));
						return true;
					} else
						stopLoopingFlag = true;
					break;
				}

				case '-':
					stopLoopingFlag = true;
					break;
				case '=':
				case '–':
				case '(':
				case ')':
				case '—':
				case '?':
				case '[':
					stopLoopingFlag = true;
					break;
				default:
					index++;
				}
			} else
				index++;
			if (stopLoopingFlag)
				break;
		}

		if (stopLoopingFlag) {
			// index points at the character c that stopped the scan.
			rt = true;
			String subSt1 = "";
			String subSt2 = "";
			len = token.length() - 1;

			if (c == '\"') {
				//handle acronyms
				if (((index == len - 2)) && (index - 1 > 0)
						&& token.charAt(index - 1) >= 'א'
						&& token.charAt(index - 1) <= 'ת')
					return false;
				subSt1 = token.substring(0, index);
				subSt2 = token.substring(index + 1);
				if (subSt2.length() >= 2) {
					// Known prefix letters in front of a quoted word: emit
					// the prefix, the quote and then the rest.
					mshwklbFlag = moshevkaleb(subSt1);
					if (mshwklbFlag) {
						prefixFlag = true;
						xmlTokenizer.createTokens("prefix=" + subSt1);
						xmlTokenizer.createTokens(String.valueOf(c));
						rt2 = seperateWords(subSt2);
						if (!rt2)
							handleSubWord(subSt2);
					}
				}
			}

			// Generic split: the stopping character stays with the first half.
			subSt1 = token.substring(0, index + 1);
			subSt2 = token.substring(index + 1);
			if (c == '-' || c == ',' || c == '–' || c == '=' || c == '\'')
				limit = 1;

			if (!prefixFlag) {
				if (subSt1.length() >= limit && subSt2.length() >= limit) {
					handleSubWord(subSt1);
					rt2 = seperateWords(subSt2);
					if (!rt2)
						handleSubWord(subSt2);
				} else
					rt = false;
			}
		}
		return rt;

	}

	/**
	 * Control function of the file-based interface: tokenizes everything
	 * readable from {@code bi}, writes the resulting XML document to
	 * {@code outputFile} and closes the reader.
	 * <p>
	 * Leading empty lines are skipped, the document with its first paragraph
	 * and sentence is created, and the input is then processed line by line.
	 * Each whitespace-separated token is first offered to handleToken; tokens
	 * it does not consume go through the prefix and suffix punctuation
	 * analysis. The reader is closed even when processing fails.
	 * 
	 * @return the value of {@code tokenCounter} (never incremented in the
	 *         code visible in this class)
	 * @throws Exception
	 *             if reading, tokenizing or writing the document fails
	 */
	public int newProcess() throws Exception {
		boolean completed = false;
		try {
			//handling empty lines at the begining of the file
			dropStartFileEmptyLines();
			//creating infrastructure for creating xml document
			createTokenizedXML();
			//create the first paragraph
			xmlTokenizer.createParapraphes();
			//create the first sentence
			xmlTokenizer.createSentences();

			//analyzing the input file line by line
			while (line != null) {
				line = line.trim();
				//empty lines in the middle of the file signify the
				//beginning of a new paragraph
				if (dropMiddleFileEmptyLines())
					continue;

				//tokens are separated from each other by white spaces
				StringTokenizer st = new StringTokenizer(line);
				while (st.hasMoreTokens()) {
					String token = st.nextToken();

					if (handleToken(token))
						continue;

					int len = token.length();
					int i1 = analyzePrefixToken(token, len);
					analyzeSuffixToken(token, len, i1);
				}
				lineCounter++;
				line = bi.readLine();
				//the web interface only processes a limited number of tokens
				if (webFlag && xmlTokenizer.getGlobalTokenCounter() > 100)
					break;
			}
			xmlTokenizer.printDoc(outputFile);
			completed = true;
		} finally {
			if (bi != null) {
				if (completed) {
					// Normal path: a failure to close is reported to the
					// caller, as before.
					bi.close();
				} else {
					try {
						bi.close();
					} catch (IOException ignored) {
						// Keep the exception that aborted the processing.
					}
				}
			}
		}
		return tokenCounter;
	}

	/**
	 * Control function of the stream-based (web) interface: tokenizes
	 * everything readable from {@code bi} and returns the document as text.
	 * <p>
	 * Works like newProcess, except that the result comes from the XML
	 * processor's no-argument {@code printDoc()} and the reader is left open.
	 * 
	 * @return the value returned by {@code xmlTokenizer.printDoc()}
	 * @throws Exception
	 *             if reading or tokenizing fails
	 */
	public String process() throws Exception {
		// Skip empty lines at the beginning of the input.
		dropStartFileEmptyLines();

		// Set up the XML document; a failure is reported, not propagated.
		try {
			createTokenizedXML();
		} catch (IOException e) {
			System.out
					.println("XMLTokenizer:process - Exception during createTokenizedXML");
			e.printStackTrace();
		}

		// Open the first paragraph and the first sentence.
		xmlTokenizer.createParapraphes();
		xmlTokenizer.createSentences();

		while (line != null) {
			line = line.trim();

			// An empty line marks a paragraph break; the helper has already
			// advanced to the next line, so just examine that one.
			if (!dropMiddleFileEmptyLines()) {
				for (StringTokenizer words = new StringTokenizer(line); words
						.hasMoreTokens();) {
					final String word = words.nextToken();
					if (!handleToken(word)) {
						final int wordLength = word.length();
						final int prefixEnd = analyzePrefixToken(word,
								wordLength);
						analyzeSuffixToken(word, wordLength, prefixEnd);
					}
				}

				lineCounter++;
				line = bi.readLine();
				// In web mode only a limited number of tokens is processed.
				if (webFlag && xmlTokenizer.getGlobalTokenCounter() > 100) {
					break;
				}
			}
		}

		return xmlTokenizer.printDoc();
	}

	/**
	 * Analyzes the end of the token: scans from the last character towards
	 * the beginning, peeling off trailing punctuation, emits the remaining
	 * word and then emits each peeled-off punctuation mark. '.', '?' and '!'
	 * also start a new sentence; a run of dots is emitted as one token.
	 * <p>
	 * A trailing apostrophe is kept as part of the word when it follows one
	 * of the letters ג ד ז צ ת ע, or a Hebrew letter that is not itself
	 * preceded by a Hebrew letter.
	 * 
	 * @param token
	 *            original token
	 * @param len
	 *            token original length
	 * @param i1
	 *            index of the first character that was not consumed as leading
	 *            punctuation (the value returned by analyzePrefixToken)
	 */
	private void analyzeSuffixToken(String token, int len, int i1) {
		if (i1 > len - 1)
			return;

		char c;
		int i = len - 1;
		while (i > 0 && i >= i1) {
			c = token.charAt(i);

			boolean punctuation = (c < 'א' || c > 'ת')
					&& !Character.isLetter(c) && !Character.isDigit(c);
			if (!punctuation)
				break;

			// An apostrophe at the end of a word may be part of the word or
			// a closing quotation mark.
			if (c == '\'') {
				char before = token.charAt(i - 1);
				if (!Character.isDigit(before)) {
					boolean beforeHebrew = before >= 'א' && before <= 'ת';
					// i - 2 < 0: the letter is the first character of the
					// token, so no Hebrew letter precedes it.
					boolean singleLetter = beforeHebrew
							&& (i - 2 < 0 || token.charAt(i - 2) < 'א' || token
									.charAt(i - 2) > 'ת');
					if (before == 'ג' || before == 'ד' || before == 'ז'
							|| before == 'צ' || before == 'ת'
							|| before == 'ע' || singleLetter)
						break;
				}
			}
			i--;
		}

		// The word itself, without leading and trailing punctuation.
		xmlTokenizer.createTokens(token.substring(i1, i + 1));

		for (int j = i + 1; j <= len - 1; j++) {
			c = token.charAt(j);
			if ((c != '.') && (c != '?') && (c != '!')) {
				xmlTokenizer.createTokens(String.valueOf(c));
			} else if (c == '.') {
				//handling ...
				int k = j;
				while ((k <= (len - 1)) && (token.charAt(k) == '.'))
					k++;
				xmlTokenizer.createTokens(token.substring(j, k));
				xmlTokenizer.createSentences();
				// Continue after the run, so its dots are not emitted again.
				j = k - 1;
			} else {
				xmlTokenizer.createTokens(String.valueOf(c));
				xmlTokenizer.createSentences();
			}
		}
	}

	/**
	 * Tells whether the first {@code i} characters of the given string form a
	 * known prefix combination, i.e. whether their {@code Translate.Heb2Eng}
	 * transliteration is listed in {@code prefixArray}.
	 * 
	 * @param analyzedString
	 *            string whose beginning is tested
	 * @param i
	 *            number of leading characters to test
	 * @return true when the leading characters are a known prefix
	 */
	private boolean moshevkaleb(String analyzedString, int i) {
		// Same lookup as the one-argument variant, applied to the head only.
		return moshevkaleb(analyzedString.substring(0, i));
	}

	/**
	 * Analyzes the beginning of the token: scans from the first character
	 * forward and emits every leading punctuation mark as its own token (a
	 * run of dots is emitted as one token). The scan stops at the first
	 * Hebrew letter, other letter or digit.
	 * 
	 * @param token
	 *            the token to analyze
	 * @param len
	 *            length of the token
	 * @return index of the first character that was not consumed as leading
	 *         punctuation; {@code len} when the whole token was consumed
	 */
	private int analyzePrefixToken(String token, int len) {
		int i = 0;
		while (i < len) {
			char c = token.charAt(i);

			if (c == '.') {
				//handling ...
				int j = i;
				while ((j < len) && (token.charAt(j) == '.'))
					j++;
				xmlTokenizer.createTokens(token.substring(i, j));
				i = j;
			} else if ((c < 'א' || c > 'ת') && !Character.isDigit(c)
					&& !Character.isLetter(c)) {
				xmlTokenizer.createTokens(String.valueOf(c));
				i++;
			} else {
				break;
			}
		}

		return i;
	}
	
//	For Users who uses data files
	/**
	 * Tokenizes the UTF-8 text of the given stream and hands the resulting
	 * document to {@code XMLMorphAnalyzer}, together with the three data file
	 * names. If tokenizing fails, the failure is reported on standard output
	 * and the analyzer is called with a {@code null} document.
	 * 
	 * @param in
	 *            text to tokenize
	 * @param pw
	 *            writer passed on to the analyzer
	 * @param dinflectionsFile
	 *            passed on to the analyzer
	 * @param dprefixesFile
	 *            passed on to the analyzer
	 * @param gimatriaFile
	 *            passed on to the analyzer
	 */
	public void tokenizeAndAnalyze(InputStream in, PrintWriter pw, String dinflectionsFile, String dprefixesFile, String gimatriaFile ) {
		System.out.println("Starting XMLTokenizer");

		Document tokenized = null;
		try {
			InputStreamReader utf8Reader = new InputStreamReader(in, "UTF8");
			bi = new BufferedReader(utf8Reader);
			process();
			xmlTokenizer.finalizeDoc();
			tokenized = xmlTokenizer.getDocument();
		} catch (Exception e) {
			System.out
					.println("XMLTokenizer:tokenizeAndAnalyze -web interface - Exception in BufferedReader or process function");
			e.printStackTrace();
		}

		new XMLMorphAnalyzer().morphologicalAnalyzer(pw, 2, tokenized,
				dinflectionsFile, dprefixesFile, gimatriaFile);
	}

	/**
	 * XML analyzer interface: tokenizes the UTF-8 text of the given stream and
	 * hands the resulting document to {@code XMLMorphAnalyzer} with empty data
	 * file names. If tokenizing fails, the failure is reported on standard
	 * output and the analyzer is called with a {@code null} document.
	 * 
	 * @param in
	 *            text to tokenize
	 * @param pw
	 *            writer passed on to the analyzer
	 */
	public void tokenizeAndAnalyze(InputStream in, PrintWriter pw) {
		System.out.println("Starting XMLTokenizer");

		Document tokenized = null;
		try {
			InputStreamReader utf8Reader = new InputStreamReader(in, "UTF8");
			bi = new BufferedReader(utf8Reader);
			process();
			xmlTokenizer.finalizeDoc();
			tokenized = xmlTokenizer.getDocument();
		} catch (Exception e) {
			System.out
					.println("XMLTokenizer:tokenizeAndAnalyze -web interface - Exception in BufferedReader or process function");
			e.printStackTrace();
		}

		new XMLMorphAnalyzer().morphologicalAnalyzer(pw, 2, tokenized, "",
				"", "");
	}

	/**
	 * XSL analyzer interface: switches on web mode (which limits how many
	 * tokens are processed), tokenizes the UTF-8 text of the given stream and
	 * hands the resulting document to {@code XMLMorphAnalyzer} together with
	 * the XSL file name. If tokenizing fails, the failure is reported on
	 * standard output and the analyzer is called with a {@code null} document.
	 * 
	 * @param in
	 *            text to tokenize
	 * @param pw
	 *            writer passed on to the analyzer
	 * @param XSLFile
	 *            passed on to the analyzer
	 */
	public void tokenizeAndAnalyze(InputStream in, PrintWriter pw,
			String XSLFile) {
		System.out.println("Starting XMLTokenizer");
		webFlag = true;

		Document tokenized = null;
		try {
			InputStreamReader utf8Reader = new InputStreamReader(in, "UTF8");
			bi = new BufferedReader(utf8Reader);
			process();
			xmlTokenizer.finalizeDoc();
			tokenized = xmlTokenizer.getDocument();
		} catch (Exception e) {
			System.out
					.println("XMLTokenizer:tokenizeAndAnalyze -web interface - Exception in BufferedReader or process function");
			e.printStackTrace();
		}

		new XMLMorphAnalyzer().morphologicalAnalyzer(pw, 2, true, tokenized,
				XSLFile);
	}

	/**
	 * Web interface: tokenizes the UTF-8 text of the given stream, prints the
	 * resulting XML on standard output and returns it. If tokenizing fails,
	 * the failure is reported and an empty string is printed and returned.
	 * 
	 * @param in
	 *            text to tokenize
	 * @return the XML produced by process(), or "" on failure
	 */
	public String myTokenizer(InputStream in) {
		System.out.println("Starting XMLTokenizer");

		String xml = "";
		try {
			InputStreamReader utf8Reader = new InputStreamReader(in, "UTF8");
			bi = new BufferedReader(utf8Reader);
			xml = process();
		} catch (Exception e) {
			System.out
					.println("XMLTokenizer:myTokenizer-web interface - Exception in BufferedReader or process function ");
			e.printStackTrace();
		}

		System.out.println(xml);
		return xml;
	}
	
	/**
	 * API for users: tokenizes the UTF-8 text of the given stream and returns
	 * the resulting XML, encoded as UTF-8, inside an in-memory output stream.
	 * If tokenizing fails, the failure is reported and the returned stream is
	 * empty.
	 * 
	 * @param in
	 *            text to tokenize
	 * @return a {@code ByteArrayOutputStream} holding the XML bytes
	 */
	public OutputStream  tokenize(InputStream in) {
		String xml = "";
		try {
			InputStreamReader utf8Reader = new InputStreamReader(in, "UTF8");
			bi = new BufferedReader(utf8Reader);
			xml = process();
		} catch (Exception e) {
			System.out
					.println("XMLTokenizer:myTokenizer-web interface - Exception in BufferedReader or process function ");
			e.printStackTrace();
		}

		// Encode the document.
		byte[] encoded = null;
		try {
			encoded = xml.getBytes("UTF-8");
		} catch (UnsupportedEncodingException e1) {
			e1.printStackTrace();
		}

		// Copy the bytes into a stream sized to fit them exactly.
		ByteArrayOutputStream buffer = new ByteArrayOutputStream(
				encoded.length);
		try {
			buffer.write(encoded);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return buffer;
	}

	/**
	 * Users interface: tokenizes the given input file and writes the XML
	 * document to the given output file. Both names are echoed on standard
	 * output. On any failure the problem is reported and the JVM is
	 * terminated with status 0.
	 * 
	 * @param inputFile
	 *            path of the text to tokenize ("" means standard input)
	 * @param outputFile
	 *            path handed to the XML processor for writing the result
	 */
	public void myTokenizer(String inputFile, String outputFile) {
		System.out.println("Starting XMLTokenizer");

		this.inputFile = inputFile;
		this.outputFile = outputFile;
		System.out.println(inputFile);
		System.out.println(outputFile);

		try {
			ioFileHandling();
			newProcess();
		} catch (FileNotFoundException e) {
			System.out
					.println("XMLTokenizer:myTokenizer File not find - please check input/output file parameter");
			e.printStackTrace();
			System.exit(0);
		} catch (UnsupportedEncodingException e) {
			System.out
					.println("XMLTokenizer:myTokenizer UnsupportedEncodingException happened");
			e.printStackTrace();
			System.exit(0);
		} catch (Exception e) {
			System.out.println("XMLTokenizer:myTokenizer Exception happend");
			e.printStackTrace();
			System.exit(0);
		}
	}

	/**
	 * Command-line entry point. With at least two arguments, the first is the
	 * input file and the second the output file (both are echoed on standard
	 * output); otherwise both names are empty and the text is read from
	 * standard input. Any failure is reported through a stack trace.
	 * 
	 * @param args
	 *            optional input file and output file
	 */
	public static void main(String[] args) {
		final XMLTokenizer tokenizer = new XMLTokenizer();

		if (args.length < 2) {
			tokenizer.inputFile = "";
			tokenizer.outputFile = "";
		} else {
			tokenizer.inputFile = args[0];
			System.out.println("inputFile=" + tokenizer.inputFile);
			tokenizer.outputFile = args[1];
			System.out.println("outputFile=" + tokenizer.outputFile);
		}

		try {
			tokenizer.ioFileHandling();
			tokenizer.newProcess();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}